From dfd402a4c4baae42398ce9180ff424d589b8bffc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:54 +0100 Subject: kcsan: Add Kernel Concurrency Sanitizer infrastructure Kernel Concurrency Sanitizer (KCSAN) is a dynamic data-race detector for kernel space. KCSAN is a sampling watchpoint-based data-race detector. See the included Documentation/dev-tools/kcsan.rst for more details. This patch adds basic infrastructure, but does not yet enable KCSAN for any architecture. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/Makefile | 1 + kernel/kcsan/Makefile | 11 + kernel/kcsan/atomic.h | 27 +++ kernel/kcsan/core.c | 626 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kcsan/debugfs.c | 275 +++++++++++++++++++++ kernel/kcsan/encoding.h | 94 ++++++++ kernel/kcsan/kcsan.h | 108 +++++++++ kernel/kcsan/report.c | 320 +++++++++++++++++++++++++ kernel/kcsan/test.c | 121 ++++++++++ 9 files changed, 1583 insertions(+) create mode 100644 kernel/kcsan/Makefile create mode 100644 kernel/kcsan/atomic.h create mode 100644 kernel/kcsan/core.c create mode 100644 kernel/kcsan/debugfs.c create mode 100644 kernel/kcsan/encoding.h create mode 100644 kernel/kcsan/kcsan.h create mode 100644 kernel/kcsan/report.c create mode 100644 kernel/kcsan/test.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..74ab46e2ebd1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile new file mode 100644 index 000000000000..dd15b62ec0b5 --- /dev/null +++ b/kernel/kcsan/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +KCSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ + $(call cc-option,-fno-stack-protector,) + +obj-y := core.o debugfs.o report.o +obj-$(CONFIG_KCSAN_SELFTEST) += test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h new file mode 100644 index 000000000000..c9c3fe628011 --- /dev/null +++ b/kernel/kcsan/atomic.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ATOMIC_H +#define _KERNEL_KCSAN_ATOMIC_H + +#include + +/* + * Helper that returns true if access to ptr should be considered as an atomic + * access, even though it is not explicitly atomic. + * + * List all volatile globals that have been observed in races, to suppress + * data race reports between accesses to these variables. + * + * For now, we assume that volatile accesses of globals are as strong as atomic + * accesses (READ_ONCE, WRITE_ONCE cast to volatile). The situation is still not + * entirely clear, as on some architectures (Alpha) READ_ONCE/WRITE_ONCE do more + * than cast to volatile. Eventually, we hope to be able to remove this + * function. 
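+ *
+ * Illustrative example: plain, unannotated reads of jiffies race with the
+ * timer interrupt updating it; returning true here makes KCSAN treat such
+ * accesses as if they were marked atomic, so these races are not reported.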
+ */ +static inline bool kcsan_is_atomic(const volatile void *ptr) +{ + /* only jiffies for now */ + return ptr == &jiffies; +} + +#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c new file mode 100644 index 000000000000..d9410d58c93e --- /dev/null +++ b/kernel/kcsan/core.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "atomic.h" +#include "encoding.h" +#include "kcsan.h" + +bool kcsan_enabled; + +/* Per-CPU kcsan_ctx for interrupts */ +static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, +}; + +/* + * Helper macros to index into adjacent slots slots, starting from address slot + * itself, followed by the right and left slots. + * + * The purpose is 2-fold: + * + * 1. if during insertion the address slot is already occupied, check if + * any adjacent slots are free; + * 2. accesses that straddle a slot boundary due to size that exceeds a + * slot's range may check adjacent slots if any watchpoint matches. + * + * Note that accesses with very large size may still miss a watchpoint; however, + * given this should be rare, this is a reasonable trade-off to make, since this + * will avoid: + * + * 1. excessive contention between watchpoint checks and setup; + * 2. larger number of simultaneous watchpoints without sacrificing + * performance. + * + * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]: + * + * slot=0: [ 1, 2, 0] + * slot=9: [10, 11, 9] + * slot=63: [64, 65, 63] + */ +#define NUM_SLOTS (1 + 2 * KCSAN_CHECK_ADJACENT) +#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) + +/* + * SLOT_IDX_FAST is used in fast-path. Not first checking the address's primary + * slot (middle) is fine if we assume that data races occur rarely. The set of + * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to + * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. + */ +#define SLOT_IDX_FAST(slot, i) (slot + i) + +/* + * Watchpoints, with each entry encoded as defined in encoding.h: in order to be + * able to safely update and access a watchpoint without introducing locking + * overhead, we encode each watchpoint as a single atomic long. The initial + * zero-initialized state matches INVALID_WATCHPOINT. + * + * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to + * use more complicated SLOT_IDX_FAST calculation with modulo in fast-path. + */ +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; + +/* + * Instructions to skip watching counter, used in should_watch(). We use a + * per-CPU counter to avoid excessive contention. 
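+ *
+ * Sketch of the sampling logic: should_watch() decrements this counter on
+ * every instrumented access and only selects an access for watching once the
+ * counter drops below zero; the slow-path then resets it via
+ * reset_kcsan_skip(), so on average roughly one in CONFIG_KCSAN_SKIP_WATCH
+ * accesses per CPU is watched (more often if the reset value is randomized).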
+ */ +static DEFINE_PER_CPU(long, kcsan_skip); + +static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, + bool expect_write, + long *encoded_watchpoint) +{ + const int slot = watchpoint_slot(addr); + const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; + atomic_long_t *watchpoint; + unsigned long wp_addr_masked; + size_t wp_size; + bool is_write; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)]; + *encoded_watchpoint = atomic_long_read(watchpoint); + if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked, + &wp_size, &is_write)) + continue; + + if (expect_write && !is_write) + continue; + + /* Check if the watchpoint matches the access. */ + if (matching_access(wp_addr_masked, wp_size, addr_masked, size)) + return watchpoint; + } + + return NULL; +} + +static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, + bool is_write) +{ + const int slot = watchpoint_slot(addr); + const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); + atomic_long_t *watchpoint; + int i; + + /* Check slot index logic, ensuring we stay within array bounds. */ + BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT + 1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, + KCSAN_CHECK_ADJACENT) != + ARRAY_SIZE(watchpoints) - 1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, + KCSAN_CHECK_ADJACENT + 1) != + ARRAY_SIZE(watchpoints) - NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + long expect_val = INVALID_WATCHPOINT; + + /* Try to acquire this slot. */ + watchpoint = &watchpoints[SLOT_IDX(slot, i)]; + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, + encoded_watchpoint)) + return watchpoint; + } + + return NULL; +} + +/* + * Return true if watchpoint was successfully consumed, false otherwise. + * + * This may return false if: + * + * 1. another thread already consumed the watchpoint; + * 2. the thread that set up the watchpoint already removed it; + * 3. the watchpoint was removed and then re-used. + */ +static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, + CONSUMED_WATCHPOINT); +} + +/* + * Return true if watchpoint was not touched, false if consumed. + */ +static inline bool remove_watchpoint(atomic_long_t *watchpoint) +{ + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != + CONSUMED_WATCHPOINT; +} + +static inline struct kcsan_ctx *get_ctx(void) +{ + /* + * In interrupt, use raw_cpu_ptr to avoid unnecessary checks, that would + * also result in calls that generate warnings in uaccess regions. + */ + return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); +} + +static inline bool is_atomic(const volatile void *ptr) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (unlikely(ctx->atomic_next > 0)) { + /* + * Because we do not have separate contexts for nested + * interrupts, in case atomic_next is set, we simply assume that + * the outer interrupt set atomic_next. In the worst case, we + * will conservatively consider operations as atomic. This is a + * reasonable trade-off to make, since this case should be + * extremely rare; however, even if extremely rare, it could + * lead to false positives otherwise. 
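+ *
+ * The hardirq_count() check below treats a hardirq nesting depth below 2 as
+ * "task or outermost interrupt"; only then is an atomic_next credit consumed,
+ * while nested interrupts conservatively leave atomic_next untouched.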
+ */ + if ((hardirq_count() >> HARDIRQ_SHIFT) < 2) + --ctx->atomic_next; /* in task, or outer interrupt */ + return true; + } + if (unlikely(ctx->atomic_nest_count > 0 || ctx->in_flat_atomic)) + return true; + + return kcsan_is_atomic(ptr); +} + +static inline bool should_watch(const volatile void *ptr, int type) +{ + /* + * Never set up watchpoints when memory operations are atomic. + * + * Need to check this first, before kcsan_skip check below: (1) atomics + * should not count towards skipped instructions, and (2) to actually + * decrement kcsan_atomic_next for consecutive instruction stream. + */ + if ((type & KCSAN_ACCESS_ATOMIC) != 0 || is_atomic(ptr)) + return false; + + if (this_cpu_dec_return(kcsan_skip) >= 0) + return false; + + /* + * NOTE: If we get here, kcsan_skip must always be reset in slow path + * via reset_kcsan_skip() to avoid underflow. + */ + + /* this operation should be watched */ + return true; +} + +static inline void reset_kcsan_skip(void) +{ + long skip_count = CONFIG_KCSAN_SKIP_WATCH - + (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? + prandom_u32_max(CONFIG_KCSAN_SKIP_WATCH) : + 0); + this_cpu_write(kcsan_skip, skip_count); +} + +static inline bool kcsan_is_enabled(void) +{ + return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; +} + +static inline unsigned int get_delay(void) +{ + unsigned int delay = in_task() ? CONFIG_KCSAN_UDELAY_TASK : + CONFIG_KCSAN_UDELAY_INTERRUPT; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + prandom_u32_max(delay) : + 0); +} + +/* + * Pull everything together: check_access() below contains the performance + * critical operations; the fast-path (including check_access) functions should + * all be inlinable by the instrumentation functions. + * + * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are + * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can + * be filtered from the stacktrace, as well as give them unique names for the + * UACCESS whitelist of objtool. Each function uses user_access_save/restore(), + * since they do not access any user memory, but instrumentation is still + * emitted in UACCESS regions. + */ + +static noinline void kcsan_found_watchpoint(const volatile void *ptr, + size_t size, bool is_write, + atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + unsigned long flags; + bool consumed; + + if (!kcsan_is_enabled()) + return; + /* + * Consume the watchpoint as soon as possible, to minimize the chances + * of !consumed. Consuming the watchpoint must always be guarded by + * kcsan_is_enabled() check, as otherwise we might erroneously + * triggering reports when disabled. + */ + consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); + + /* keep this after try_consume_watchpoint */ + flags = user_access_save(); + + if (consumed) { + kcsan_report(ptr, size, is_write, true, raw_smp_processor_id(), + KCSAN_REPORT_CONSUMED_WATCHPOINT); + } else { + /* + * The other thread may not print any diagnostics, as it has + * already removed the watchpoint, or another thread consumed + * the watchpoint before this thread. 
+ */ + kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + } + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + user_access_restore(flags); +} + +static noinline void kcsan_setup_watchpoint(const volatile void *ptr, + size_t size, bool is_write) +{ + atomic_long_t *watchpoint; + union { + u8 _1; + u16 _2; + u32 _4; + u64 _8; + } expect_value; + bool value_change = false; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + + /* + * Always reset kcsan_skip counter in slow-path to avoid underflow; see + * should_watch(). + */ + reset_kcsan_skip(); + + if (!kcsan_is_enabled()) + goto out; + + if (!check_encodable((unsigned long)ptr, size)) { + kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + goto out; + } + + /* + * Disable interrupts & preemptions to avoid another thread on the same + * CPU accessing memory locations for the set up watchpoint; this is to + * avoid reporting races to e.g. CPU-local data. + * + * An alternative would be adding the source CPU to the watchpoint + * encoding, and checking that watchpoint-CPU != this-CPU. There are + * several problems with this: + * 1. we should avoid stealing more bits from the watchpoint encoding + * as it would affect accuracy, as well as increase performance + * overhead in the fast-path; + * 2. if we are preempted, but there *is* a genuine data race, we + * would *not* report it -- since this is the common case (vs. + * CPU-local data accesses), it makes more sense (from a data race + * detection point of view) to simply disable preemptions to ensure + * as many tasks as possible run on other CPUs. + */ + local_irq_save(irq_flags); + + watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); + if (watchpoint == NULL) { + /* + * Out of capacity: the size of `watchpoints`, and the frequency + * with which `should_watch()` returns true should be tweaked so + * that this case happens very rarely. + */ + kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + goto out_unlock; + } + + kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); + kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + + /* + * Read the current value, to later check and infer a race if the data + * was modified via a non-instrumented access, e.g. from a device. + */ + switch (size) { + case 1: + expect_value._1 = READ_ONCE(*(const u8 *)ptr); + break; + case 2: + expect_value._2 = READ_ONCE(*(const u16 *)ptr); + break; + case 4: + expect_value._4 = READ_ONCE(*(const u32 *)ptr); + break; + case 8: + expect_value._8 = READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { + kcsan_disable_current(); + pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + is_write ? "write" : "read", size, ptr, + watchpoint_slot((unsigned long)ptr), + encode_watchpoint((unsigned long)ptr, size, is_write)); + kcsan_enable_current(); + } + + /* + * Delay this thread, to increase probability of observing a racy + * conflicting access. + */ + udelay(get_delay()); + + /* + * Re-read value, and check if it is as expected; if not, we infer a + * racy access. 
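+ *
+ * Note (inferred from the comparison below): an uninstrumented writer that
+ * changes the value and restores it within the delay window leaves
+ * value_change false, so such a race is only caught if the watchpoint itself
+ * was consumed.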
+ */ + switch (size) { + case 1: + value_change = expect_value._1 != READ_ONCE(*(const u8 *)ptr); + break; + case 2: + value_change = expect_value._2 != READ_ONCE(*(const u16 *)ptr); + break; + case 4: + value_change = expect_value._4 != READ_ONCE(*(const u32 *)ptr); + break; + case 8: + value_change = expect_value._8 != READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + /* Check if this access raced with another. */ + if (!remove_watchpoint(watchpoint)) { + /* + * No need to increment 'data_races' counter, as the racing + * thread already did. + */ + kcsan_report(ptr, size, is_write, size > 8 || value_change, + smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + } else if (value_change) { + /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + kcsan_report(ptr, size, is_write, true, + smp_processor_id(), + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); + } + + kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); +out_unlock: + local_irq_restore(irq_flags); +out: + user_access_restore(ua_flags); +} + +static __always_inline void check_access(const volatile void *ptr, size_t size, + int type) +{ + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + atomic_long_t *watchpoint; + long encoded_watchpoint; + + /* + * Avoid user_access_save in fast-path: find_watchpoint is safe without + * user_access_save, as the address that ptr points to is only used to + * check if a watchpoint exists; ptr is never dereferenced. + */ + watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + &encoded_watchpoint); + /* + * It is safe to check kcsan_is_enabled() after find_watchpoint in the + * slow-path, as long as no state changes that cause a data race to be + * detected and reported have occurred until kcsan_is_enabled() is + * checked. + */ + + if (unlikely(watchpoint != NULL)) + kcsan_found_watchpoint(ptr, size, is_write, watchpoint, + encoded_watchpoint); + else if (unlikely(should_watch(ptr, type))) + kcsan_setup_watchpoint(ptr, size, is_write); +} + +/* === Public interface ===================================================== */ + +void __init kcsan_init(void) +{ + BUG_ON(!in_task()); + + kcsan_debugfs_init(); + + /* + * We are in the init task, and no other tasks should be running; + * WRITE_ONCE without memory barrier is sufficient. + */ + if (IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE)) + WRITE_ONCE(kcsan_enabled, true); +} + +/* === Exported interface =================================================== */ + +void kcsan_disable_current(void) +{ + ++get_ctx()->disable_count; +} +EXPORT_SYMBOL(kcsan_disable_current); + +void kcsan_enable_current(void) +{ + if (get_ctx()->disable_count-- == 0) { + /* + * Warn if kcsan_enable_current() calls are unbalanced with + * kcsan_disable_current() calls, which causes disable_count to + * become negative and should not happen. + */ + kcsan_disable_current(); /* restore to 0, KCSAN still enabled */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_enable_current); + +void kcsan_nestable_atomic_begin(void) +{ + /* + * Do *not* check and warn if we are in a flat atomic region: nestable + * and flat atomic regions are independent from each other. + * See include/linux/kcsan.h: struct kcsan_ctx comments for more + * comments. 
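+ *
+ * Usage sketch: nestable sections may be nested,
+ *
+ *   kcsan_nestable_atomic_begin();
+ *     kcsan_nestable_atomic_begin();
+ *     ...
+ *     kcsan_nestable_atomic_end();
+ *   kcsan_nestable_atomic_end();
+ *
+ * and accesses are treated as atomic until the outermost end(); flat sections
+ * simply set/clear in_flat_atomic and are not reference counted.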
+ */ + + ++get_ctx()->atomic_nest_count; +} +EXPORT_SYMBOL(kcsan_nestable_atomic_begin); + +void kcsan_nestable_atomic_end(void) +{ + if (get_ctx()->atomic_nest_count-- == 0) { + /* + * Warn if kcsan_nestable_atomic_end() calls are unbalanced with + * kcsan_nestable_atomic_begin() calls, which causes + * atomic_nest_count to become negative and should not happen. + */ + kcsan_nestable_atomic_begin(); /* restore to 0 */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_nestable_atomic_end); + +void kcsan_flat_atomic_begin(void) +{ + get_ctx()->in_flat_atomic = true; +} +EXPORT_SYMBOL(kcsan_flat_atomic_begin); + +void kcsan_flat_atomic_end(void) +{ + get_ctx()->in_flat_atomic = false; +} +EXPORT_SYMBOL(kcsan_flat_atomic_end); + +void kcsan_atomic_next(int n) +{ + get_ctx()->atomic_next = n; +} +EXPORT_SYMBOL(kcsan_atomic_next); + +void __kcsan_check_access(const volatile void *ptr, size_t size, int type) +{ + check_access(ptr, size, type); +} +EXPORT_SYMBOL(__kcsan_check_access); + +/* + * KCSAN uses the same instrumentation that is emitted by supported compilers + * for ThreadSanitizer (TSAN). + * + * When enabled, the compiler emits instrumentation calls (the functions + * prefixed with "__tsan" below) for all loads and stores that it generated; + * inline asm is not instrumented. + * + * Note that, not all supported compiler versions distinguish aligned/unaligned + * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned + * version to the generic version, which can handle both. + */ + +#define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr) \ + { \ + check_access(ptr, size, 0); \ + } \ + EXPORT_SYMBOL(__tsan_read##size); \ + void __tsan_unaligned_read##size(void *ptr) \ + __alias(__tsan_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr) \ + { \ + check_access(ptr, size, KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_write##size); \ + void __tsan_unaligned_write##size(void *ptr) \ + __alias(__tsan_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_write##size) + +DEFINE_TSAN_READ_WRITE(1); +DEFINE_TSAN_READ_WRITE(2); +DEFINE_TSAN_READ_WRITE(4); +DEFINE_TSAN_READ_WRITE(8); +DEFINE_TSAN_READ_WRITE(16); + +void __tsan_read_range(void *ptr, size_t size) +{ + check_access(ptr, size, 0); +} +EXPORT_SYMBOL(__tsan_read_range); + +void __tsan_write_range(void *ptr, size_t size) +{ + check_access(ptr, size, KCSAN_ACCESS_WRITE); +} +EXPORT_SYMBOL(__tsan_write_range); + +/* + * The below are not required by KCSAN, but can still be emitted by the + * compiler. + */ +void __tsan_func_entry(void *call_pc) +{ +} +EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void) +{ +} +EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void) +{ +} +EXPORT_SYMBOL(__tsan_init); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c new file mode 100644 index 000000000000..041d520a0183 --- /dev/null +++ b/kernel/kcsan/debugfs.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcsan.h" + +/* + * Statistics counters. + */ +static atomic_long_t counters[KCSAN_COUNTER_COUNT]; + +/* + * Addresses for filtering functions from reporting. This list can be used as a + * whitelist or blacklist. 
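+ *
+ * Example usage (assuming debugfs is mounted at /sys/kernel/debug; the
+ * commands are parsed by debugfs_write() below, and "my_func" is merely a
+ * placeholder for any kernel function name):
+ *
+ *   echo whitelist > /sys/kernel/debug/kcsan   # report only listed functions
+ *   echo '!my_func' > /sys/kernel/debug/kcsan  # add my_func to the list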
+ */ +static struct { + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ +} report_filterlist = { + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ +}; +static DEFINE_SPINLOCK(report_filterlist_lock); + +static const char *counter_to_name(enum kcsan_counter_id id) +{ + switch (id) { + case KCSAN_COUNTER_USED_WATCHPOINTS: + return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: + return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: + return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: + return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: + return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: + return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: + return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: + return "encoding_false_positives"; + case KCSAN_COUNTER_COUNT: + BUG(); + } + return NULL; +} + +void kcsan_counter_inc(enum kcsan_counter_id id) +{ + atomic_long_inc(&counters[id]); +} + +void kcsan_counter_dec(enum kcsan_counter_id id) +{ + atomic_long_dec(&counters[id]); +} + +/* + * The microbenchmark allows benchmarking KCSAN core runtime only. To run + * multiple threads, pipe 'microbench=' from multiple tasks into the + * debugfs file. + */ +static void microbenchmark(unsigned long iters) +{ + cycles_t cycles; + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + /* + * We can run this benchmark from multiple tasks; this address + * calculation increases likelyhood of some accesses overlapping + * (they still won't conflict because all are reads). + */ + unsigned long addr = + iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); + __kcsan_check_read((void *)addr, sizeof(long)); + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); +} + +static int cmp_filterlist_addrs(const void *rhs, const void *lhs) +{ + const unsigned long a = *(const unsigned long *)rhs; + const unsigned long b = *(const unsigned long *)lhs; + + return a < b ? -1 : a == b ? 0 : 1; +} + +bool kcsan_skip_report_debugfs(unsigned long func_addr) +{ + unsigned long symbolsize, offset; + unsigned long flags; + bool ret = false; + + if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) + return false; + func_addr -= offset; /* get function start */ + + spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == 0) + goto out; + + /* Sort array if it is unsorted, and then do a binary search. */ + if (!report_filterlist.sorted) { + sort(report_filterlist.addrs, report_filterlist.used, + sizeof(unsigned long), cmp_filterlist_addrs, NULL); + report_filterlist.sorted = true; + } + ret = !!bsearch(&func_addr, report_filterlist.addrs, + report_filterlist.used, sizeof(unsigned long), + cmp_filterlist_addrs); + if (report_filterlist.whitelist) + ret = !ret; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static void set_report_filterlist_whitelist(bool whitelist) +{ + unsigned long flags; + + spin_lock_irqsave(&report_filterlist_lock, flags); + report_filterlist.whitelist = whitelist; + spin_unlock_irqrestore(&report_filterlist_lock, flags); +} + +/* Returns 0 on success, error-code otherwise. 
*/ +static ssize_t insert_report_filterlist(const char *func) +{ + unsigned long flags; + unsigned long addr = kallsyms_lookup_name(func); + ssize_t ret = 0; + + if (!addr) { + pr_err("KCSAN: could not find function: '%s'\n", func); + return -ENOENT; + } + + spin_lock_irqsave(&report_filterlist_lock, flags); + + if (report_filterlist.addrs == NULL) { + /* initial allocation */ + report_filterlist.addrs = + kmalloc_array(report_filterlist.size, + sizeof(unsigned long), GFP_KERNEL); + if (report_filterlist.addrs == NULL) { + ret = -ENOMEM; + goto out; + } + } else if (report_filterlist.used == report_filterlist.size) { + /* resize filterlist */ + size_t new_size = report_filterlist.size * 2; + unsigned long *new_addrs = + krealloc(report_filterlist.addrs, + new_size * sizeof(unsigned long), GFP_KERNEL); + + if (new_addrs == NULL) { + /* leave filterlist itself untouched */ + ret = -ENOMEM; + goto out; + } + + report_filterlist.size = new_size; + report_filterlist.addrs = new_addrs; + } + + /* Note: deduplicating should be done in userspace. */ + report_filterlist.addrs[report_filterlist.used++] = + kallsyms_lookup_name(func); + report_filterlist.sorted = false; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static int show_info(struct seq_file *file, void *v) +{ + int i; + unsigned long flags; + + /* show stats */ + seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) + seq_printf(file, "%s: %ld\n", counter_to_name(i), + atomic_long_read(&counters[i])); + + /* show filter functions, and filter type */ + spin_lock_irqsave(&report_filterlist_lock, flags); + seq_printf(file, "\n%s functions: %s\n", + report_filterlist.whitelist ? "whitelisted" : "blacklisted", + report_filterlist.used == 0 ? "none" : ""); + for (i = 0; i < report_filterlist.used; ++i) + seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return 0; +} + +static int debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_info, NULL); +} + +static ssize_t debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + char kbuf[KSYM_NAME_LEN]; + char *arg; + int read_len = count < (sizeof(kbuf) - 1) ? 
count : (sizeof(kbuf) - 1); + + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + arg = strstrip(kbuf); + + if (!strcmp(arg, "on")) { + WRITE_ONCE(kcsan_enabled, true); + } else if (!strcmp(arg, "off")) { + WRITE_ONCE(kcsan_enabled, false); + } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + return -EINVAL; + microbenchmark(iters); + } else if (!strcmp(arg, "whitelist")) { + set_report_filterlist_whitelist(true); + } else if (!strcmp(arg, "blacklist")) { + set_report_filterlist_whitelist(false); + } else if (arg[0] == '!') { + ssize_t ret = insert_report_filterlist(&arg[1]); + + if (ret < 0) + return ret; + } else { + return -EINVAL; + } + + return count; +} + +static const struct file_operations debugfs_ops = { .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release }; + +void __init kcsan_debugfs_init(void) +{ + debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); +} diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h new file mode 100644 index 000000000000..e17bdac0e54b --- /dev/null +++ b/kernel/kcsan/encoding.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ENCODING_H +#define _KERNEL_KCSAN_ENCODING_H + +#include +#include +#include + +#include "kcsan.h" + +#define SLOT_RANGE PAGE_SIZE +#define INVALID_WATCHPOINT 0 +#define CONSUMED_WATCHPOINT 1 + +/* + * The maximum useful size of accesses for which we set up watchpoints is the + * max range of slots we check on an access. + */ +#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT)) + +/* + * Number of bits we use to store size info. + */ +#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE) +/* + * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS); + * however, most 64-bit architectures do not use the full 64-bit address space. + * Also, in order for a false positive to be observable 2 things need to happen: + * + * 1. different addresses but with the same encoded address race; + * 2. and both map onto the same watchpoint slots; + * + * Both these are assumed to be very unlikely. However, in case it still happens + * happens, the report logic will filter out the false positive (see report.c). + */ +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS) + +/* + * Masks to set/retrieve the encoded data. + */ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG - 1) +#define WATCHPOINT_SIZE_MASK \ + GENMASK(BITS_PER_LONG - 2, BITS_PER_LONG - 2 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_MASK \ + GENMASK(BITS_PER_LONG - 3 - WATCHPOINT_SIZE_BITS, 0) + +static inline bool check_encodable(unsigned long addr, size_t size) +{ + return size <= MAX_ENCODABLE_SIZE; +} + +static inline long encode_watchpoint(unsigned long addr, size_t size, + bool is_write) +{ + return (long)((is_write ? 
WATCHPOINT_WRITE_MASK : 0) | + (size << WATCHPOINT_ADDR_BITS) | + (addr & WATCHPOINT_ADDR_MASK)); +} + +static inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, size_t *size, + bool *is_write) +{ + if (watchpoint == INVALID_WATCHPOINT || + watchpoint == CONSUMED_WATCHPOINT) + return false; + + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> + WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + + return true; +} + +/* + * Return watchpoint slot for an address. + */ +static inline int watchpoint_slot(unsigned long addr) +{ + return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; +} + +static inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) +{ + unsigned long end_range1 = addr1 + size1 - 1; + unsigned long end_range2 = addr2 + size2 - 1; + + return addr1 <= end_range2 && addr2 <= end_range1; +} + +#endif /* _KERNEL_KCSAN_ENCODING_H */ diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h new file mode 100644 index 000000000000..1bb2f1c0d61e --- /dev/null +++ b/kernel/kcsan/kcsan.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please + * see Documentation/dev-tools/kcsan.rst. + */ + +#ifndef _KERNEL_KCSAN_KCSAN_H +#define _KERNEL_KCSAN_KCSAN_H + +#include + +/* The number of adjacent watchpoints to check. */ +#define KCSAN_CHECK_ADJACENT 1 + +/* + * Globally enable and disable KCSAN. + */ +extern bool kcsan_enabled; + +/* + * Initialize debugfs file. + */ +void kcsan_debugfs_init(void); + +enum kcsan_counter_id { + /* + * Number of watchpoints currently in use. + */ + KCSAN_COUNTER_USED_WATCHPOINTS, + + /* + * Total number of watchpoints set up. + */ + KCSAN_COUNTER_SETUP_WATCHPOINTS, + + /* + * Total number of data races. + */ + KCSAN_COUNTER_DATA_RACES, + + /* + * Number of times no watchpoints were available. + */ + KCSAN_COUNTER_NO_CAPACITY, + + /* + * A thread checking a watchpoint raced with another checking thread; + * only one will be reported. + */ + KCSAN_COUNTER_REPORT_RACES, + + /* + * Observed data value change, but writer thread unknown. + */ + KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN, + + /* + * The access cannot be encoded to a valid watchpoint. + */ + KCSAN_COUNTER_UNENCODABLE_ACCESSES, + + /* + * Watchpoint encoding caused a watchpoint to fire on mismatching + * accesses. + */ + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES, + + KCSAN_COUNTER_COUNT, /* number of counters */ +}; + +/* + * Increment/decrement counter with given id; avoid calling these in fast-path. + */ +void kcsan_counter_inc(enum kcsan_counter_id id); +void kcsan_counter_dec(enum kcsan_counter_id id); + +/* + * Returns true if data races in the function symbol that maps to func_addr + * (offsets are ignored) should *not* be reported. + */ +bool kcsan_skip_report_debugfs(unsigned long func_addr); + +enum kcsan_report_type { + /* + * The thread that set up the watchpoint and briefly stalled was + * signalled that another thread triggered the watchpoint. + */ + KCSAN_REPORT_RACE_SIGNAL, + + /* + * A thread found and consumed a matching watchpoint. + */ + KCSAN_REPORT_CONSUMED_WATCHPOINT, + + /* + * No other thread was observed to race with the access, but the data + * value before and after the stall differs. + */ + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, +}; +/* + * Print a race report from thread that encountered the race. 
+ */ +void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); + +#endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c new file mode 100644 index 000000000000..ead5610bafa7 --- /dev/null +++ b/kernel/kcsan/report.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include "kcsan.h" +#include "encoding.h" + +/* + * Max. number of stack entries to show in the report. + */ +#define NUM_STACK_ENTRIES 64 + +/* + * Other thread info: communicated from other racing thread to thread that set + * up the watchpoint, which then prints the complete report atomically. Only + * need one struct, as all threads should to be serialized regardless to print + * the reports, with reporting being in the slow-path. + */ +static struct { + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; +} other_info = { .ptr = NULL }; + +/* + * This spinlock protects reporting and other_info, since other_info is usually + * required when reporting. + */ +static DEFINE_SPINLOCK(report_lock); + +/* + * Special rules to skip reporting. + */ +static bool skip_report(bool is_write, bool value_change, + unsigned long top_frame) +{ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && + !value_change) { + /* + * The access is a write, but the data value did not change. + * + * We opt-out of this filter for certain functions at request of + * maintainers. + */ + char buf[64]; + + snprintf(buf, sizeof(buf), "%ps", (void *)top_frame); + if (!strnstr(buf, "rcu_", sizeof(buf)) && + !strnstr(buf, "_rcu", sizeof(buf)) && + !strnstr(buf, "_srcu", sizeof(buf))) + return true; + } + + return kcsan_skip_report_debugfs(top_frame); +} + +static inline const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +/* Return thread description: in task or interrupt. */ +static const char *get_thread_desc(int task_id) +{ + if (task_id != -1) { + static char buf[32]; /* safe: protected by report_lock */ + + snprintf(buf, sizeof(buf), "task %i", task_id); + return buf; + } + return "interrupt"; +} + +/* Helper to skip KCSAN-related functions in stack-trace. */ +static int get_stack_skipnr(unsigned long stack_entries[], int num_entries) +{ + char buf[64]; + int skip = 0; + + for (; skip < num_entries; ++skip) { + snprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + if (!strnstr(buf, "csan_", sizeof(buf)) && + !strnstr(buf, "tsan_", sizeof(buf)) && + !strnstr(buf, "_once_size", sizeof(buf))) { + break; + } + } + return skip; +} + +/* Compares symbolized strings of addr1 and addr2. */ +static int sym_strcmp(void *addr1, void *addr2) +{ + char buf1[64]; + char buf2[64]; + + snprintf(buf1, sizeof(buf1), "%pS", addr1); + snprintf(buf2, sizeof(buf2), "%pS", addr2); + return strncmp(buf1, buf2, sizeof(buf1)); +} + +/* + * Returns true if a report was generated, false otherwise. 
+ */ +static bool print_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, + enum kcsan_report_type type) +{ + unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; + int num_stack_entries = + stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + int other_skipnr; + + /* + * Must check report filter rules before starting to print. + */ + if (skip_report(is_write, true, stack_entries[skipnr])) + return false; + + if (type == KCSAN_REPORT_RACE_SIGNAL) { + other_skipnr = get_stack_skipnr(other_info.stack_entries, + other_info.num_stack_entries); + + /* value_change is only known for the other thread */ + if (skip_report(other_info.is_write, value_change, + other_info.stack_entries[other_skipnr])) + return false; + } + + /* Print report header. */ + pr_err("==================================================================\n"); + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: { + void *this_fn = (void *)stack_entries[skipnr]; + void *other_fn = (void *)other_info.stack_entries[other_skipnr]; + int cmp; + + /* + * Order functions lexographically for consistent bug titles. + * Do not print offset of functions to keep title short. + */ + cmp = sym_strcmp(other_fn, this_fn); + pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + cmp < 0 ? other_fn : this_fn, + cmp < 0 ? this_fn : other_fn); + } break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("BUG: KCSAN: data-race in %pS\n", + (void *)stack_entries[skipnr]); + break; + + default: + BUG(); + } + + pr_err("\n"); + + /* Print information about the racing accesses. */ + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(other_info.is_write), other_info.ptr, + other_info.size, get_thread_desc(other_info.task_pid), + other_info.cpu_id); + + /* Print the other thread's stack trace. */ + stack_trace_print(other_info.stack_entries + other_skipnr, + other_info.num_stack_entries - other_skipnr, + 0); + + pr_err("\n"); + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + default: + BUG(); + } + /* Print stack trace of this thread. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + + /* Print report footer. */ + pr_err("\n"); + pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); + dump_stack_print_info(KERN_DEFAULT); + pr_err("==================================================================\n"); + + return true; +} + +static void release_report(unsigned long *flags, enum kcsan_report_type type) +{ + if (type == KCSAN_REPORT_RACE_SIGNAL) + other_info.ptr = NULL; /* mark for reuse */ + + spin_unlock_irqrestore(&report_lock, *flags); +} + +/* + * Depending on the report type either sets other_info and returns false, or + * acquires the matching other_info and returns true. If other_info is not + * required for the report type, simply acquires report_lock and returns true. 
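+ *
+ * Example interleaving (T0 set up the watchpoint, T1 then hit it):
+ *
+ *   T1: prepare_report(CONSUMED_WATCHPOINT) -> fills other_info, returns false
+ *   T0: prepare_report(RACE_SIGNAL)         -> finds matching other_info and
+ *                                              returns true with report_lock held
+ *   T0: print_report(); release_report()    -> other_info.ptr = NULL (reusable)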
+ */ +static bool prepare_report(unsigned long *flags, const volatile void *ptr, + size_t size, bool is_write, int cpu_id, + enum kcsan_report_type type) +{ + if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && + type != KCSAN_REPORT_RACE_SIGNAL) { + /* other_info not required; just acquire report_lock */ + spin_lock_irqsave(&report_lock, *flags); + return true; + } + +retry: + spin_lock_irqsave(&report_lock, *flags); + + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + if (other_info.ptr != NULL) + break; /* still in use, retry */ + + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save( + other_info.stack_entries, NUM_STACK_ENTRIES, 1); + + spin_unlock_irqrestore(&report_lock, *flags); + + /* + * The other thread will print the summary; other_info may now + * be consumed. + */ + return false; + + case KCSAN_REPORT_RACE_SIGNAL: + if (other_info.ptr == NULL) + break; /* no data available yet, retry */ + + /* + * First check if this is the other_info we are expecting, i.e. + * matches based on how watchpoint was encoded. + */ + if (!matching_access((unsigned long)other_info.ptr & + WATCHPOINT_ADDR_MASK, + other_info.size, + (unsigned long)ptr & WATCHPOINT_ADDR_MASK, + size)) + break; /* mismatching watchpoint, retry */ + + if (!matching_access((unsigned long)other_info.ptr, + other_info.size, (unsigned long)ptr, + size)) { + /* + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. + */ + kcsan_counter_inc( + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + + /* discard this other_info */ + release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + return false; + } + + /* + * Matching & usable access in other_info: keep other_info_lock + * locked, as this thread consumes it to print the full report; + * unlocked in release_report. + */ + return true; + + default: + BUG(); + } + + spin_unlock_irqrestore(&report_lock, *flags); + goto retry; +} + +void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type) +{ + unsigned long flags = 0; + + kcsan_disable_current(); + if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { + if (print_report(ptr, size, is_write, value_change, cpu_id, + type) && + panic_on_warn) + panic("panic_on_warn set ...\n"); + + release_report(&flags, type); + } + kcsan_enable_current(); +} diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c new file mode 100644 index 000000000000..0bae63c5ca65 --- /dev/null +++ b/kernel/kcsan/test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. 
+ */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. */ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); -- cgit v1.2.3 From 0ebba7141eadc4804ec5b4bb5106331b0c3abf4c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:58 +0100 Subject: build, kcsan: Add KCSAN build exceptions This blacklists several compilation units from KCSAN. See the respective inline comments for the reasoning. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/Makefile | 5 +++++ kernel/sched/Makefile | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 74ab46e2ebd1..cc53f7c25446 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,9 @@ endif # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n +# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data +# are CPU local" => assume no data races), to reduce overhead in interrupts. +KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. 
KCOV_INSTRUMENT_module.o := n @@ -30,6 +33,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible @@ -118,6 +122,7 @@ obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n +KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n $(obj)/configs.o: $(obj)/config_data.gz diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..e9307a9c54e7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,6 +7,12 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n +# There are numerous races here, however, most of them due to plain accesses. +# This would make it even harder for syzbot to find reproducers, because these +# bugs trigger without specific input. Disable by default, but should re-enable +# eventually. +KCSAN_SANITIZE := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond -- cgit v1.2.3 From 5cbaefe9743bf14c9d3106db0cc19f8cb0a3ca22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Nov 2019 10:41:43 +0100 Subject: kcsan: Improve various small stylistic details Tidy up a few bits: - Fix typos and grammar, improve wording. - Remove spurious newlines that are col80 warning artifacts where the resulting line-break is worse than the disease it's curing. - Use core kernel coding style to improve readability and reduce spurious code pattern variations. - Use better vertical alignment for structure definitions and initialization sequences. - Misc other small details. No change in functionality intended. Cc: linux-kernel@vger.kernel.org Cc: Marco Elver Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 59 +++++++++++++++++++++------------------------- kernel/kcsan/debugfs.c | 62 +++++++++++++++++++++++-------------------------- kernel/kcsan/encoding.h | 25 ++++++++++---------- kernel/kcsan/kcsan.h | 11 +++++---- kernel/kcsan/report.c | 42 ++++++++++++++++----------------- kernel/kcsan/test.c | 6 ++--- kernel/sched/Makefile | 2 +- 8 files changed, 100 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index c9c3fe628011..576e03ddd6a3 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -6,7 +6,7 @@ #include /* - * Helper that returns true if access to ptr should be considered as an atomic + * Helper that returns true if access to @ptr should be considered an atomic * access, even though it is not explicitly atomic. 
* * List all volatile globals that have been observed in races, to suppress diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index d9410d58c93e..3314fc29e236 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -19,10 +19,10 @@ bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, }; /* @@ -50,11 +50,11 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * slot=9: [10, 11, 9] * slot=63: [64, 65, 63] */ -#define NUM_SLOTS (1 + 2 * KCSAN_CHECK_ADJACENT) +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) #define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) /* - * SLOT_IDX_FAST is used in fast-path. Not first checking the address's primary + * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary * slot (middle) is fine if we assume that data races occur rarely. The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. @@ -68,9 +68,9 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * zero-initialized state matches INVALID_WATCHPOINT. * * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to - * use more complicated SLOT_IDX_FAST calculation with modulo in fast-path. + * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path. */ -static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; /* * Instructions to skip watching counter, used in should_watch(). We use a @@ -78,7 +78,8 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, +static inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, bool expect_write, long *encoded_watchpoint) { @@ -110,8 +111,8 @@ static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, return NULL; } -static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline atomic_long_t * +insert_watchpoint(unsigned long addr, size_t size, bool is_write) { const int slot = watchpoint_slot(addr); const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -120,21 +121,16 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, /* Check slot index logic, ensuring we stay within array bounds. */ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); - BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT + 1) != 0); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT) != - ARRAY_SIZE(watchpoints) - 1); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT + 1) != - ARRAY_SIZE(watchpoints) - NUM_SLOTS); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS); for (i = 0; i < NUM_SLOTS; ++i) { long expect_val = INVALID_WATCHPOINT; /* Try to acquire this slot. 
*/ watchpoint = &watchpoints[SLOT_IDX(slot, i)]; - if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, - encoded_watchpoint)) + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint)) return watchpoint; } @@ -150,11 +146,10 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, - long encoded_watchpoint) +static inline bool +try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { - return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, - CONSUMED_WATCHPOINT); + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); } /* @@ -162,14 +157,13 @@ static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, */ static inline bool remove_watchpoint(atomic_long_t *watchpoint) { - return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != - CONSUMED_WATCHPOINT; + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } static inline struct kcsan_ctx *get_ctx(void) { /* - * In interrupt, use raw_cpu_ptr to avoid unnecessary checks, that would + * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would * also result in calls that generate warnings in uaccess regions. */ return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); @@ -260,7 +254,8 @@ static inline unsigned int get_delay(void) */ static noinline void kcsan_found_watchpoint(const volatile void *ptr, - size_t size, bool is_write, + size_t size, + bool is_write, atomic_long_t *watchpoint, long encoded_watchpoint) { @@ -296,8 +291,8 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, user_access_restore(flags); } -static noinline void kcsan_setup_watchpoint(const volatile void *ptr, - size_t size, bool is_write) +static noinline void +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) { atomic_long_t *watchpoint; union { @@ -346,8 +341,8 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { /* - * Out of capacity: the size of `watchpoints`, and the frequency - * with which `should_watch()` returns true should be tweaked so + * Out of capacity: the size of 'watchpoints', and the frequency + * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 041d520a0183..bec42dab32ee 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -24,39 +24,31 @@ static atomic_long_t counters[KCSAN_COUNTER_COUNT]; * whitelist or blacklist. 
*/ static struct { - unsigned long *addrs; /* array of addresses */ - size_t size; /* current size */ - int used; /* number of elements used */ - bool sorted; /* if elements are sorted */ - bool whitelist; /* if list is a blacklist or whitelist */ + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ } report_filterlist = { - .addrs = NULL, - .size = 8, /* small initial size */ - .used = 0, - .sorted = false, - .whitelist = false, /* default is blacklist */ + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ }; static DEFINE_SPINLOCK(report_filterlist_lock); static const char *counter_to_name(enum kcsan_counter_id id) { switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: - return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: - return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: - return "data_races"; - case KCSAN_COUNTER_NO_CAPACITY: - return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: - return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: - return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: - return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: - return "encoding_false_positives"; + case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; case KCSAN_COUNTER_COUNT: BUG(); } @@ -116,7 +108,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) return false; - func_addr -= offset; /* get function start */ + func_addr -= offset; /* Get function start */ spin_lock_irqsave(&report_filterlist_lock, flags); if (report_filterlist.used == 0) @@ -195,6 +187,7 @@ static ssize_t insert_report_filterlist(const char *func) out: spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; } @@ -226,8 +219,8 @@ static int debugfs_open(struct inode *inode, struct file *file) return single_open(file, show_info, NULL); } -static ssize_t debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *off) +static ssize_t +debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { char kbuf[KSYM_NAME_LEN]; char *arg; @@ -264,10 +257,13 @@ static ssize_t debugfs_write(struct file *file, const char __user *buf, return count; } -static const struct file_operations debugfs_ops = { .read = seq_read, - .open = debugfs_open, - .write = debugfs_write, - .release = single_release }; +static const struct file_operations debugfs_ops = +{ + .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release +}; void __init kcsan_debugfs_init(void) { diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index e17bdac0e54b..b63890e86449 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -10,7 +10,8 @@ #include "kcsan.h" #define SLOT_RANGE 
PAGE_SIZE -#define INVALID_WATCHPOINT 0 + +#define INVALID_WATCHPOINT 0 #define CONSUMED_WATCHPOINT 1 /* @@ -34,24 +35,24 @@ * Both these are assumed to be very unlikely. However, in case it still happens * happens, the report logic will filter out the false positive (see report.c). */ -#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) /* * Masks to set/retrieve the encoded data. */ -#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG - 1) +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) #define WATCHPOINT_SIZE_MASK \ - GENMASK(BITS_PER_LONG - 2, BITS_PER_LONG - 2 - WATCHPOINT_SIZE_BITS) + GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) #define WATCHPOINT_ADDR_MASK \ - GENMASK(BITS_PER_LONG - 3 - WATCHPOINT_SIZE_BITS, 0) + GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) static inline bool check_encodable(unsigned long addr, size_t size) { return size <= MAX_ENCODABLE_SIZE; } -static inline long encode_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline long +encode_watchpoint(unsigned long addr, size_t size, bool is_write) { return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) | (size << WATCHPOINT_ADDR_BITS) | @@ -59,17 +60,17 @@ static inline long encode_watchpoint(unsigned long addr, size_t size, } static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, size_t *size, + unsigned long *addr_masked, + size_t *size, bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) return false; - *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; - *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> - WATCHPOINT_ADDR_BITS; - *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); return true; } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 1bb2f1c0d61e..d3b9a96ac8a4 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -72,14 +72,14 @@ enum kcsan_counter_id { /* * Increment/decrement counter with given id; avoid calling these in fast-path. */ -void kcsan_counter_inc(enum kcsan_counter_id id); -void kcsan_counter_dec(enum kcsan_counter_id id); +extern void kcsan_counter_inc(enum kcsan_counter_id id); +extern void kcsan_counter_dec(enum kcsan_counter_id id); /* * Returns true if data races in the function symbol that maps to func_addr * (offsets are ignored) should *not* be reported. */ -bool kcsan_skip_report_debugfs(unsigned long func_addr); +extern bool kcsan_skip_report_debugfs(unsigned long func_addr); enum kcsan_report_type { /* @@ -99,10 +99,11 @@ enum kcsan_report_type { */ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, }; + /* * Print a race report from thread that encountered the race. */ -void kcsan_report(const volatile void *ptr, size_t size, bool is_write, - bool value_change, int cpu_id, enum kcsan_report_type type); +extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ead5610bafa7..0eea05a3135b 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -22,13 +22,13 @@ * the reports, with reporting being in the slow-path. 
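 * A non-NULL .ptr marks other_info as in use: the racing thread fills
 * it in under report_lock, and the thread that set up the watchpoint
 * prints the combined report and resets .ptr to NULL for reuse (see
 * the retry logic in prepare_report() and release_report()).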
*/ static struct { - const volatile void *ptr; - size_t size; - bool is_write; - int task_pid; - int cpu_id; - unsigned long stack_entries[NUM_STACK_ENTRIES]; - int num_stack_entries; + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; } other_info = { .ptr = NULL }; /* @@ -40,8 +40,8 @@ static DEFINE_SPINLOCK(report_lock); /* * Special rules to skip reporting. */ -static bool skip_report(bool is_write, bool value_change, - unsigned long top_frame) +static bool +skip_report(bool is_write, bool value_change, unsigned long top_frame) { if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && !value_change) { @@ -105,6 +105,7 @@ static int sym_strcmp(void *addr1, void *addr2) snprintf(buf1, sizeof(buf1), "%pS", addr1); snprintf(buf2, sizeof(buf2), "%pS", addr2); + return strncmp(buf1, buf2, sizeof(buf1)); } @@ -116,8 +117,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, enum kcsan_report_type type) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; - int num_stack_entries = - stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); int other_skipnr; @@ -131,7 +131,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, other_skipnr = get_stack_skipnr(other_info.stack_entries, other_info.num_stack_entries); - /* value_change is only known for the other thread */ + /* @value_change is only known for the other thread */ if (skip_report(other_info.is_write, value_change, other_info.stack_entries[other_skipnr])) return false; @@ -241,13 +241,12 @@ retry: if (other_info.ptr != NULL) break; /* still in use, retry */ - other_info.ptr = ptr; - other_info.size = size; - other_info.is_write = is_write; - other_info.task_pid = in_task() ? task_pid_nr(current) : -1; - other_info.cpu_id = cpu_id; - other_info.num_stack_entries = stack_trace_save( - other_info.stack_entries, NUM_STACK_ENTRIES, 1); + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? 
task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); spin_unlock_irqrestore(&report_lock, *flags); @@ -299,6 +298,7 @@ retry: } spin_unlock_irqrestore(&report_lock, *flags); + goto retry; } @@ -309,9 +309,7 @@ void kcsan_report(const volatile void *ptr, size_t size, bool is_write, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { - if (print_report(ptr, size, is_write, value_change, cpu_id, - type) && - panic_on_warn) + if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c index 0bae63c5ca65..cc6000239dc0 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/test.c @@ -34,7 +34,7 @@ static bool test_encode_decode(void) if (WARN_ON(!check_encodable(addr, size))) return false; - /* encode and decode */ + /* Encode and decode */ { const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -42,7 +42,7 @@ static bool test_encode_decode(void) size_t verif_size; bool verif_is_write; - /* check special watchpoints */ + /* Check special watchpoints */ if (WARN_ON(decode_watchpoint( INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write))) @@ -52,7 +52,7 @@ static bool test_encode_decode(void) &verif_size, &verif_is_write))) return false; - /* check decoding watchpoint returns same data */ + /* Check decoding watchpoint returns same data */ if (WARN_ON(!decode_watchpoint( encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write))) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e9307a9c54e7..5fc9c9b70862 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,7 +7,7 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous races here, however, most of them due to plain accesses. +# There are numerous data races here, however, most of them are due to plain accesses. # This would make it even harder for syzbot to find reproducers, because these # bugs trigger without specific input. Disable by default, but should re-enable # eventually. -- cgit v1.2.3 From d47715f50e833f12c5e829ce9dcc4a65104fa74f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 19 Nov 2019 19:57:42 +0100 Subject: kcsan, ubsan: Make KCSAN+UBSAN work together Context: http://lkml.kernel.org/r/fb7e25d8-aba4-3dcf-7761-cb7ecb3ebb71@infradead.org Reported-by: Randy Dunlap Signed-off-by: Marco Elver Acked-by: Randy Dunlap # build-tested Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index dd15b62ec0b5..df6b7799e492 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 KCSAN_SANITIZE := n KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From 5661dd95a2958634485bb1a53f90a6ab621d4b0c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Jan 2020 15:16:44 -0700 Subject: printk: Convert a use of sprintf to snprintf in console_unlock When CONFIG_PRINTK is disabled (e.g. 
when building allnoconfig), clang warns: ../kernel/printk/printk.c:2416:10: warning: 'sprintf' will always overflow; destination buffer has size 0, but format string expands to at least 33 [-Wfortify-source] len = sprintf(text, ^ 1 warning generated. It is not wrong; text has a zero size when CONFIG_PRINTK is disabled because LOG_LINE_MAX and PREFIX_MAX are both zero. Change to snprintf so that this case is explicitly handled without any risk of overflow. Link: https://github.com/ClangBuiltLinux/linux/issues/846 Link: https://github.com/llvm/llvm-project/commit/6d485ff455ea2b37fef9e06e426dae6c1241b231 Link: http://lkml.kernel.org/r/20200130221644.2273-1-natechancellor@gmail.com Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Cc: clang-built-linux@googlegroups.com Signed-off-by: Nathan Chancellor Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fada22dc4ab6..a44094727a5c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2413,9 +2413,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; -- cgit v1.2.3 From ad8cd1db80cc33337bdbee63c453ef6d5132474b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:31 +0100 Subject: printk: Move console matching logic into a separate function This moves the loop that tries to match a newly registered console with the command line or add_preferred_console list into a separate helper, in order to be able to call it multiple times in subsequent patches. Link: https://lore.kernel.org/r/20200213095133.23176-2-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 105 ++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fada22dc4ab6..0ebcdf53e75d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,6 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; +static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2626,6 +2627,60 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); +/* + * This is called by register_console() to try to match + * the newly registered console with any of the ones selected + * by either the command line or add_preferred_console() and + * setup/enable it. 
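+ *
+ * Returns 0 if the console was matched and enabled (or handed off to
+ * the braille driver, or had already been enabled by its driver),
+ * -EIO if a matching entry's setup() callback failed, and -ENOENT if
+ * no console_cmdline entry matched.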
+ * + * Care need to be taken with consoles that are statically + * enabled such as netconsole + */ +static int try_enable_new_console(struct console *newcon) +{ + struct console_cmdline *c; + int i; + + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) + return 0; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + return -EIO; + } + newcon->flags |= CON_ENABLED; + if (i == preferred_console) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } + return 0; + } + + /* + * Some consoles, such as pstore and netconsole, can be enabled even + * without matching. + */ + if (newcon->flags & CON_ENABLED) + return 0; + + return -ENOENT; +} + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -2647,11 +2702,9 @@ early_param("keep_bootcon", keep_bootcon_setup); */ void register_console(struct console *newcon) { - int i; unsigned long flags; struct console *bcon = NULL; - struct console_cmdline *c; - static bool has_preferred; + int err; if (console_drivers) for_each_console(bcon) @@ -2678,15 +2731,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred || bcon || !console_drivers) - has_preferred = preferred_console >= 0; + if (!has_preferred_console || bcon || !console_drivers) + has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred) { + if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2694,47 +2747,19 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred = true; + has_preferred_console = true; } } } /* - * See if this console matches one we selected on - * the command line. + * See if this console matches one we selected on + * the command line or if it was statically enabled */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { - if (!newcon->match || - newcon->match(newcon, c->name, c->index, c->options) != 0) { - /* default matching */ - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; - - if (_braille_register_console(newcon, c)) - return; - - if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - break; - } - - newcon->flags |= CON_ENABLED; - if (i == preferred_console) { - newcon->flags |= CON_CONSDEV; - has_preferred = true; - } - break; - } + err = try_enable_new_console(newcon); - if (!(newcon->flags & CON_ENABLED)) + /* printk() messages are not printed to the Braille console. 
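+ * A non-zero err likewise means the console could not be enabled
+ * (nothing matched, or its setup() failed), so it is not registered.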
*/ + if (err || newcon->flags & CON_BRL) return; /* -- cgit v1.2.3 From e369d8227fd211be36242fc44a9dc2209e246b9a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:32 +0100 Subject: printk: Fix preferred console selection with multiple matches In the following circumstances, the rule of selecting the console corresponding to the last "console=" entry on the command line as the preferred console (CON_CONSDEV, ie, /dev/console) fails. This is a specific example, but it could happen with different consoles that have a similar name aliasing mechanism. - The kernel command line has both console=tty0 and console=ttyS0 in that order (the latter with speed etc... arguments). This is common with some cloud setups such as Amazon Linux. - add_preferred_console is called early to register "uart0". In our case that happens from acpi_parse_spcr() on arm64 since the "enable_console" argument is true on that architecture. This causes "uart0" to become entry 0 of the console_cmdline array. Now, because of the above, what happens is: - add_preferred_console is called by the cmdline parsing for tty0 and ttyS0 respectively, thus occupying entries 1 and 2 of the console_cmdline array (since this happens after ACPI SPCR parsing). At that point preferred_console is set to 2 as expected. - When the tty layer kicks in, it will call register_console for tty0. This will match entry 1 in console_cmdline array. It isn't our preferred console but because it's our only console at this point, it will end up "first" in the consoles list. - When 8250 probes the actual serial port later on, it calls register_console for ttyS0. At that point the loop in register_console tries to match it with the entries in the console_cmdline array. Ideally this should match ttyS0 in entry 2, which is preferred, causing it to be inserted first and to replace tty0 as CONSDEV. However, 8250 provides a "match" hook in its struct console, and that hook will match "uart" as an alias to "ttyS". So we match uart0 at entry 0 in the array which is not the preferred console and will not match entry 2 which is since we break out of the loop on the first match. As a result, we don't set CONSDEV and don't insert it first, but second in the console list. As a result, we end up with tty0 remaining first in the array, and thus /dev/console going there instead of the last user specified one which is ttyS0. This tentative fix register_console() to scan first for consoles specified on the command line, and only if none is found, to then scan for consoles specified by the architecture. Link: https://lore.kernel.org/r/20200213095133.23176-3-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/console_cmdline.h | 1 + kernel/printk/printk.c | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 11f19c466af5..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -6,6 +6,7 @@ struct console_cmdline { char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ + bool user_specified; /* Specified by command line vs. 
platform */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE char *brl_options; /* Options for braille driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0ebcdf53e75d..f76ef3f0efca 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2116,7 +2116,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...) #endif static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) + char *brl_options, bool user_specified) { struct console_cmdline *c; int i; @@ -2131,6 +2131,8 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; + if (user_specified) + c->user_specified = true; return 0; } } @@ -2140,6 +2142,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; + c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; @@ -2194,7 +2197,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - __add_preferred_console(buf, idx, options, brl_options); + __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } @@ -2215,7 +2218,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - return __add_preferred_console(name, idx, options, NULL); + return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; @@ -2636,7 +2639,7 @@ early_param("keep_bootcon", keep_bootcon_setup); * Care need to be taken with consoles that are statically * enabled such as netconsole */ -static int try_enable_new_console(struct console *newcon) +static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i; @@ -2644,6 +2647,8 @@ static int try_enable_new_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + if (c->user_specified != user_specified) + continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ @@ -2673,9 +2678,10 @@ static int try_enable_new_console(struct console *newcon) /* * Some consoles, such as pstore and netconsole, can be enabled even - * without matching. + * without matching. Accept the pre-enabled consoles only when match() + * and setup() had a change to be called. */ - if (newcon->flags & CON_ENABLED) + if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; @@ -2752,11 +2758,12 @@ void register_console(struct console *newcon) } } - /* - * See if this console matches one we selected on - * the command line or if it was statically enabled - */ - err = try_enable_new_console(newcon); + /* See if this console matches one we selected on the command line */ + err = try_enable_new_console(newcon, true); + + /* If not, try to match against the platform default(s) */ + if (err == -ENOENT) + err = try_enable_new_console(newcon, false); /* printk() messages are not printed to the Braille console. 
*/ if (err || newcon->flags & CON_BRL) -- cgit v1.2.3 From 33225d7b0ac9903c5701bbede7dfdaeba74ad6c3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:33 +0100 Subject: printk: Correctly set CON_CONSDEV even when preferred console was not registered CON_CONSDEV flag was historically used to put/keep the preferred console first in console_drivers list. Where the preferred console is the last on the command line. The ordering is important only when opening /dev/console: + tty_kopen() + tty_lookup_driver() + console_device() The flag was originally an implementation detail. But it was later made accessible from userspace via /proc/consoles. It was used, for example, by the tool "showconsole" to show the real tty accessible via /dev/console, see https://github.com/bitstreamout/showconsole Now, the current code sets CON_CONSDEV only for the preferred console or when a fallback console is added. The flag is not set when the preferred console is defined on the command line but it is not registered from some reasons. Simple solution is to set CON_CONSDEV flag for the first registered console. It will work most of the time because: + Most real consoles have console->device defined. + Boot consoles are removed in printk_late_init(). + unregister_console() moves CON_CONSDEV flag to the next console. Clean solution would require checking con->device when the preferred console is registered and in unregister_console(). Conclusion: Use the simple solution for now. It is better than the current state and good enough. The clean solution is not worth it. It would complicate the already complicated code without too much gain. Instead the code would deserve a complete rewrite. Link: https://lore.kernel.org/r/20200213095133.23176-4-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt [pmladek@suse.com: Correct reasoning in the commit message, comment update.] Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f76ef3f0efca..cf0ceacdae2f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,6 +2788,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; -- cgit v1.2.3 From 5c361425744d1e3b03d835dde659708683ca27d1 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 7 Jan 2020 17:31:04 +0100 Subject: kcsan: Prefer __always_inline for fast-path Prefer __always_inline for fast-path functions that are called outside of user_access_save, to avoid generating UACCESS warnings when optimizing for size (CC_OPTIMIZE_FOR_SIZE). It will also avoid future surprises with compiler versions that change the inlining heuristic even when optimizing for performance. Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/58708908-84a0-0a81-a836-ad97e33dbb62@infradead.org --- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 18 +++++++++--------- kernel/kcsan/encoding.h | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index 576e03ddd6a3..a9c193053491 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -18,7 +18,7 @@ * than cast to volatile. Eventually, we hope to be able to remove this * function. */ -static inline bool kcsan_is_atomic(const volatile void *ptr) +static __always_inline bool kcsan_is_atomic(const volatile void *ptr) { /* only jiffies for now */ return ptr == &jiffies; diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3314fc29e236..4d4ab5c5dc53 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -78,10 +78,10 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, - size_t size, - bool expect_write, - long *encoded_watchpoint) +static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, + bool expect_write, + long *encoded_watchpoint) { const int slot = watchpoint_slot(addr); const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; @@ -146,7 +146,7 @@ insert_watchpoint(unsigned long addr, size_t size, bool is_write) * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool +static __always_inline bool try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); @@ -160,7 +160,7 @@ static inline bool remove_watchpoint(atomic_long_t *watchpoint) return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } -static inline struct kcsan_ctx *get_ctx(void) +static __always_inline struct kcsan_ctx *get_ctx(void) { /* * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would @@ -169,7 +169,7 @@ static inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } -static inline bool is_atomic(const volatile void *ptr) +static __always_inline bool is_atomic(const volatile void *ptr) { struct kcsan_ctx *ctx = get_ctx(); @@ -193,7 +193,7 @@ static inline bool is_atomic(const volatile void *ptr) return kcsan_is_atomic(ptr); } -static inline bool should_watch(const volatile void *ptr, int type) +static __always_inline bool should_watch(const volatile void *ptr, int type) { /* * Never set up watchpoints when memory operations are atomic. 
@@ -226,7 +226,7 @@ static inline void reset_kcsan_skip(void) this_cpu_write(kcsan_skip, skip_count); } -static inline bool kcsan_is_enabled(void) +static __always_inline bool kcsan_is_enabled(void) { return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index b63890e86449..f03562aaf2eb 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -59,10 +59,10 @@ encode_watchpoint(unsigned long addr, size_t size, bool is_write) (addr & WATCHPOINT_ADDR_MASK)); } -static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, - size_t *size, - bool *is_write) +static __always_inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, + size_t *size, + bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) @@ -78,13 +78,13 @@ static inline bool decode_watchpoint(long watchpoint, /* * Return watchpoint slot for an address. */ -static inline int watchpoint_slot(unsigned long addr) +static __always_inline int watchpoint_slot(unsigned long addr) { return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; } -static inline bool matching_access(unsigned long addr1, size_t size1, - unsigned long addr2, size_t size2) +static __always_inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) { unsigned long end_range1 = addr1 + size1 - 1; unsigned long end_range2 = addr2 + size2 - 1; -- cgit v1.2.3 From 47144eca282189afcf34ef25aee8408c168765d4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Jan 2020 19:48:33 +0100 Subject: kcsan: Show full access type in report This commit adds access-type information to KCSAN's reports as follows: "read", "read (marked)", "write", and "write (marked)". Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 15 ++++++++------- kernel/kcsan/kcsan.h | 2 +- kernel/kcsan/report.c | 43 ++++++++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4d4ab5c5dc53..87bf857c8893 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -255,7 +255,7 @@ static inline unsigned int get_delay(void) static noinline void kcsan_found_watchpoint(const volatile void *ptr, size_t size, - bool is_write, + int type, atomic_long_t *watchpoint, long encoded_watchpoint) { @@ -276,7 +276,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { - kcsan_report(ptr, size, is_write, true, raw_smp_processor_id(), + kcsan_report(ptr, size, type, true, raw_smp_processor_id(), KCSAN_REPORT_CONSUMED_WATCHPOINT); } else { /* @@ -292,8 +292,9 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, } static noinline void -kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) { + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; atomic_long_t *watchpoint; union { u8 _1; @@ -415,13 +416,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) * No need to increment 'data_races' counter, as the racing * thread already did. 
*/ - kcsan_report(ptr, size, is_write, size > 8 || value_change, + kcsan_report(ptr, size, type, size > 8 || value_change, smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); } else if (value_change) { /* Inferring a race, since the value should not have changed. */ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) - kcsan_report(ptr, size, is_write, true, + kcsan_report(ptr, size, type, true, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } @@ -455,10 +456,10 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, */ if (unlikely(watchpoint != NULL)) - kcsan_found_watchpoint(ptr, size, is_write, watchpoint, + kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); else if (unlikely(should_watch(ptr, type))) - kcsan_setup_watchpoint(ptr, size, is_write); + kcsan_setup_watchpoint(ptr, size, type); } /* === Public interface ===================================================== */ diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index d3b9a96ac8a4..8492da45494b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -103,7 +103,7 @@ enum kcsan_report_type { /* * Print a race report from thread that encountered the race. */ -extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, +extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 0eea05a3135b..9f503ca2ff7a 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -24,7 +24,7 @@ static struct { const volatile void *ptr; size_t size; - bool is_write; + int access_type; int task_pid; int cpu_id; unsigned long stack_entries[NUM_STACK_ENTRIES]; @@ -41,8 +41,10 @@ static DEFINE_SPINLOCK(report_lock); * Special rules to skip reporting. */ static bool -skip_report(bool is_write, bool value_change, unsigned long top_frame) +skip_report(int access_type, bool value_change, unsigned long top_frame) { + const bool is_write = (access_type & KCSAN_ACCESS_WRITE) != 0; + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && !value_change) { /* @@ -63,9 +65,20 @@ skip_report(bool is_write, bool value_change, unsigned long top_frame) return kcsan_skip_report_debugfs(top_frame); } -static inline const char *get_access_type(bool is_write) +static const char *get_access_type(int type) { - return is_write ? "write" : "read"; + switch (type) { + case 0: + return "read"; + case KCSAN_ACCESS_ATOMIC: + return "read (marked)"; + case KCSAN_ACCESS_WRITE: + return "write"; + case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked)"; + default: + BUG(); + } } /* Return thread description: in task or interrupt. */ @@ -112,7 +125,7 @@ static int sym_strcmp(void *addr1, void *addr2) /* * Returns true if a report was generated, false otherwise. */ -static bool print_report(const volatile void *ptr, size_t size, bool is_write, +static bool print_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type) { @@ -124,7 +137,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, /* * Must check report filter rules before starting to print. 
*/ - if (skip_report(is_write, true, stack_entries[skipnr])) + if (skip_report(access_type, true, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -132,7 +145,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, other_info.num_stack_entries); /* @value_change is only known for the other thread */ - if (skip_report(other_info.is_write, value_change, + if (skip_report(other_info.access_type, value_change, other_info.stack_entries[other_skipnr])) return false; } @@ -170,7 +183,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, switch (type) { case KCSAN_REPORT_RACE_SIGNAL: pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(other_info.is_write), other_info.ptr, + get_access_type(other_info.access_type), other_info.ptr, other_info.size, get_thread_desc(other_info.task_pid), other_info.cpu_id); @@ -181,14 +194,14 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(is_write), ptr, size, + get_access_type(access_type), ptr, size, get_thread_desc(in_task() ? task_pid_nr(current) : -1), cpu_id); break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(is_write), ptr, size, + get_access_type(access_type), ptr, size, get_thread_desc(in_task() ? task_pid_nr(current) : -1), cpu_id); break; @@ -223,7 +236,7 @@ static void release_report(unsigned long *flags, enum kcsan_report_type type) * required for the report type, simply acquires report_lock and returns true. */ static bool prepare_report(unsigned long *flags, const volatile void *ptr, - size_t size, bool is_write, int cpu_id, + size_t size, int access_type, int cpu_id, enum kcsan_report_type type) { if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && @@ -243,7 +256,7 @@ retry: other_info.ptr = ptr; other_info.size = size; - other_info.is_write = is_write; + other_info.access_type = access_type; other_info.task_pid = in_task() ? task_pid_nr(current) : -1; other_info.cpu_id = cpu_id; other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); @@ -302,14 +315,14 @@ retry: goto retry; } -void kcsan_report(const volatile void *ptr, size_t size, bool is_write, +void kcsan_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type) { unsigned long flags = 0; kcsan_disable_current(); - if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { - if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) + if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { + if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); -- cgit v1.2.3 From 05f9a4067964e3f864210271a6299f13d2eeea55 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Jan 2020 19:48:34 +0100 Subject: kcsan: Rate-limit reporting per data races KCSAN data-race reports can occur quite frequently, so much so as to render the system useless. This commit therefore adds support for time-based rate-limiting KCSAN reports, with the time interval specified by a new KCSAN_REPORT_ONCE_IN_MS Kconfig option. The default is 3000 milliseconds, also known as three seconds. 
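As a rough sizing illustration (assuming 4 KiB pages and 64-bit longs, neither of which this patch depends on): struct report_time below is three unsigned longs, so REPORT_TIMES_MAX works out to 4096 / 24 = 170, and with the default 3000 ms interval the table is capped at those 170 entries.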
Because KCSAN must detect data races in allocators and in other contexts where use of allocation is ill-advised, a fixed-size array is used to buffer reports during each reporting interval. To reduce the number of reports lost due to array overflow, this commit stores only one instance of duplicate reports, which has the benefit of further reducing KCSAN's console output rate. Reported-by: Qian Cai Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 110 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9f503ca2ff7a..b5b4feea49de 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -31,12 +32,99 @@ static struct { int num_stack_entries; } other_info = { .ptr = NULL }; +/* + * Information about reported data races; used to rate limit reporting. + */ +struct report_time { + /* + * The last time the data race was reported. + */ + unsigned long time; + + /* + * The frames of the 2 threads; if only 1 thread is known, one frame + * will be 0. + */ + unsigned long frame1; + unsigned long frame2; +}; + +/* + * Since we also want to be able to debug allocators with KCSAN, to avoid + * deadlock, report_times cannot be dynamically resized with krealloc in + * rate_limit_report. + * + * Therefore, we use a fixed-size array, which at most will occupy a page. This + * still adequately rate limits reports, assuming that a) number of unique data + * races is not excessive, and b) occurrence of unique data races within the + * same time window is limited. + */ +#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) +#define REPORT_TIMES_SIZE \ + (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \ + REPORT_TIMES_MAX : \ + CONFIG_KCSAN_REPORT_ONCE_IN_MS) +static struct report_time report_times[REPORT_TIMES_SIZE]; + /* * This spinlock protects reporting and other_info, since other_info is usually * required when reporting. */ static DEFINE_SPINLOCK(report_lock); +/* + * Checks if the data race identified by thread frames frame1 and frame2 has + * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). + */ +static bool rate_limit_report(unsigned long frame1, unsigned long frame2) +{ + struct report_time *use_entry = &report_times[0]; + unsigned long invalid_before; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0); + + if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0) + return false; + + invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); + + /* Check if a matching data race report exists. */ + for (i = 0; i < REPORT_TIMES_SIZE; ++i) { + struct report_time *rt = &report_times[i]; + + /* + * Must always select an entry for use to store info as we + * cannot resize report_times; at the end of the scan, use_entry + * will be the oldest entry, which ideally also happened before + * KCSAN_REPORT_ONCE_IN_MS ago. + */ + if (time_before(rt->time, use_entry->time)) + use_entry = rt; + + /* + * Initially, no need to check any further as this entry as well + * as following entries have never been used. + */ + if (rt->time == 0) + break; + + /* Check if entry expired. */ + if (time_before(rt->time, invalid_before)) + continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ + + /* Reported recently, check if data race matches. 
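+ * The comparison is symmetric in the two frames, so a race recorded
+ * as (A, B) also rate-limits a later report of (B, A).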
*/ + if ((rt->frame1 == frame1 && rt->frame2 == frame2) || + (rt->frame1 == frame2 && rt->frame2 == frame1)) + return true; + } + + use_entry->time = jiffies; + use_entry->frame1 = frame1; + use_entry->frame2 = frame2; + return false; +} + /* * Special rules to skip reporting. */ @@ -132,7 +220,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); - int other_skipnr; + unsigned long this_frame = stack_entries[skipnr]; + unsigned long other_frame = 0; + int other_skipnr = 0; /* silence uninit warnings */ /* * Must check report filter rules before starting to print. @@ -143,34 +233,34 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, if (type == KCSAN_REPORT_RACE_SIGNAL) { other_skipnr = get_stack_skipnr(other_info.stack_entries, other_info.num_stack_entries); + other_frame = other_info.stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ - if (skip_report(other_info.access_type, value_change, - other_info.stack_entries[other_skipnr])) + if (skip_report(other_info.access_type, value_change, other_frame)) return false; } + if (rate_limit_report(this_frame, other_frame)) + return false; + /* Print report header. */ pr_err("==================================================================\n"); switch (type) { case KCSAN_REPORT_RACE_SIGNAL: { - void *this_fn = (void *)stack_entries[skipnr]; - void *other_fn = (void *)other_info.stack_entries[other_skipnr]; int cmp; /* * Order functions lexographically for consistent bug titles. * Do not print offset of functions to keep title short. */ - cmp = sym_strcmp(other_fn, this_fn); + cmp = sym_strcmp((void *)other_frame, (void *)this_frame); pr_err("BUG: KCSAN: data-race in %ps / %ps\n", - cmp < 0 ? other_fn : this_fn, - cmp < 0 ? this_fn : other_fn); + (void *)(cmp < 0 ? other_frame : this_frame), + (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: data-race in %pS\n", - (void *)stack_entries[skipnr]); + pr_err("BUG: KCSAN: data-race in %pS\n", (void *)this_frame); break; default: -- cgit v1.2.3 From f1bc96210c6a6b853b4b2eec808141956e8fbc5d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 15 Jan 2020 17:25:12 +0100 Subject: kcsan: Make KCSAN compatible with lockdep We must avoid any recursion into lockdep if KCSAN is enabled on utilities used by lockdep. One manifestation of this is corruption of lockdep's IRQ trace state (if TRACE_IRQFLAGS), resulting in spurious warnings (see below). This commit fixes this by: 1. Using raw_local_irq{save,restore} in kcsan_setup_watchpoint(). 2. Disabling lockdep in kcsan_report(). Tested with: CONFIG_LOCKDEP=y CONFIG_DEBUG_LOCKDEP=y CONFIG_TRACE_IRQFLAGS=y This fix eliminates spurious warnings such as the following one: WARNING: CPU: 0 PID: 2 at kernel/locking/lockdep.c:4406 check_flags.part.0+0x101/0x220 Modules linked in: CPU: 0 PID: 2 Comm: kthreadd Not tainted 5.5.0-rc1+ #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:check_flags.part.0+0x101/0x220 Call Trace: lock_is_held_type+0x69/0x150 freezer_fork+0x20b/0x370 cgroup_post_fork+0x2c9/0x5c0 copy_process+0x2675/0x3b40 _do_fork+0xbe/0xa30 ? _raw_spin_unlock_irqrestore+0x40/0x50 ? match_held_lock+0x56/0x250 ? kthread_park+0xf0/0xf0 kernel_thread+0xa6/0xd0 ? 
kthread_park+0xf0/0xf0 kthreadd+0x321/0x3d0 ? kthread_create_on_cpu+0x130/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 64 hardirqs last enabled at (63): [] _raw_spin_unlock_irqrestore+0x40/0x50 hardirqs last disabled at (64): [] kcsan_setup_watchpoint+0x92/0x460 softirqs last enabled at (32): [] fpu__copy+0xe8/0x470 softirqs last disabled at (30): [] fpu__copy+0x69/0x470 Reported-by: Qian Cai Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Tested-by: Qian Cai Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 6 ++++-- kernel/kcsan/report.c | 11 +++++++++++ kernel/locking/Makefile | 3 +++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 87bf857c8893..64b30f7716a1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -336,8 +336,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * CPU-local data accesses), it makes more sense (from a data race * detection point of view) to simply disable preemptions to ensure * as many tasks as possible run on other CPUs. + * + * Use raw versions, to avoid lockdep recursion via IRQ flags tracing. */ - local_irq_save(irq_flags); + raw_local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -429,7 +431,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: - local_irq_restore(irq_flags); + raw_local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index b5b4feea49de..33bdf8b229b5 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -410,6 +411,14 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, { unsigned long flags = 0; + /* + * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if + * we do not turn off lockdep here; this could happen due to recursion + * into lockdep via KCSAN if we detect a data race in utilities used by + * lockdep. + */ + lockdep_off(); + kcsan_disable_current(); if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) @@ -418,4 +427,6 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, release_report(&flags, type); } kcsan_enable_current(); + + lockdep_on(); } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 45452facff3b..6d11cfb9b41f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,6 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +KCSAN_SANITIZE_lockdep.o := n + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From ad4f8eeca8eaa24afb6059c241a2f4baf86378f3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jan 2020 16:01:02 +0100 Subject: kcsan: Address missing case with KCSAN_REPORT_VALUE_CHANGE_ONLY Even with KCSAN_REPORT_VALUE_CHANGE_ONLY, KCSAN still reports data races between reads and watchpointed writes, even if the writes wrote values already present. This commit causes KCSAN to unconditionally skip reporting in this case. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 33bdf8b229b5..7cd34285df74 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -130,12 +130,25 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) * Special rules to skip reporting. */ static bool -skip_report(int access_type, bool value_change, unsigned long top_frame) +skip_report(bool value_change, unsigned long top_frame) { - const bool is_write = (access_type & KCSAN_ACCESS_WRITE) != 0; - - if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && - !value_change) { + /* + * The first call to skip_report always has value_change==true, since we + * cannot know the value written of an instrumented access. For the 2nd + * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY: + * + * 1. read watchpoint, conflicting write (value_change==true): report; + * 2. read watchpoint, conflicting write (value_change==false): skip; + * 3. write watchpoint, conflicting write (value_change==true): report; + * 4. write watchpoint, conflicting write (value_change==false): skip; + * 5. write watchpoint, conflicting read (value_change==false): skip; + * 6. write watchpoint, conflicting read (value_change==true): impossible; + * + * Cases 1-4 are intuitive and expected; case 5 ensures we do not report + * data races where the write may have rewritten the same value; and + * case 6 is simply impossible. + */ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { /* * The access is a write, but the data value did not change. * @@ -228,7 +241,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, /* * Must check report filter rules before starting to print. */ - if (skip_report(access_type, true, stack_entries[skipnr])) + if (skip_report(true, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -237,7 +250,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, other_frame = other_info.stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ - if (skip_report(other_info.access_type, value_change, other_frame)) + if (skip_report(value_change, other_frame)) return false; } -- cgit v1.2.3 From 1e6ee2f0fe8ae682757960edf455e99f611268a0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 Feb 2020 18:21:10 +0100 Subject: kcsan: Add option to assume plain aligned writes up to word size are atomic This adds option KCSAN_ASSUME_PLAIN_WRITES_ATOMIC. If enabled, plain aligned writes up to word size are assumed to be atomic, and also not subject to other unsafe compiler optimizations resulting in data races. This option has been enabled by default to reflect current kernel-wide preferences. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 64b30f7716a1..e3c7d8f34f2f 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -169,10 +170,20 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? 
¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } -static __always_inline bool is_atomic(const volatile void *ptr) +static __always_inline bool +is_atomic(const volatile void *ptr, size_t size, int type) { - struct kcsan_ctx *ctx = get_ctx(); + struct kcsan_ctx *ctx; + + if ((type & KCSAN_ACCESS_ATOMIC) != 0) + return true; + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && + (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && + IS_ALIGNED((unsigned long)ptr, size)) + return true; /* Assume aligned writes up to word size are atomic. */ + + ctx = get_ctx(); if (unlikely(ctx->atomic_next > 0)) { /* * Because we do not have separate contexts for nested @@ -193,7 +204,8 @@ static __always_inline bool is_atomic(const volatile void *ptr) return kcsan_is_atomic(ptr); } -static __always_inline bool should_watch(const volatile void *ptr, int type) +static __always_inline bool +should_watch(const volatile void *ptr, size_t size, int type) { /* * Never set up watchpoints when memory operations are atomic. @@ -202,7 +214,7 @@ static __always_inline bool should_watch(const volatile void *ptr, int type) * should not count towards skipped instructions, and (2) to actually * decrement kcsan_atomic_next for consecutive instruction stream. */ - if ((type & KCSAN_ACCESS_ATOMIC) != 0 || is_atomic(ptr)) + if (is_atomic(ptr, size, type)) return false; if (this_cpu_dec_return(kcsan_skip) >= 0) @@ -460,7 +472,7 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, if (unlikely(watchpoint != NULL)) kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); - else if (unlikely(should_watch(ptr, type))) + else if (unlikely(should_watch(ptr, size, type))) kcsan_setup_watchpoint(ptr, size, type); } -- cgit v1.2.3 From ed95f95c86cd53621103d865d62b5e1f96e60edb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 5 Feb 2020 11:14:19 +0100 Subject: kcsan: Fix 0-sized checks Instrumentation of arbitrary memory-copy functions, such as user-copies, may be called with size of 0, which could lead to false positives. To avoid this, add a comparison in check_access() for size==0, which will be optimized out for constant sized instrumentation (__tsan_{read,write}N), and therefore not affect the common-case fast-path. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 7 +++++++ kernel/kcsan/test.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index e3c7d8f34f2f..82c2bef827d4 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -455,6 +455,13 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, atomic_long_t *watchpoint; long encoded_watchpoint; + /* + * Do nothing for 0 sized check; this comparison will be optimized out + * for constant sized instrumentation (__tsan_{read,write}N). + */ + if (unlikely(size == 0)) + return; + /* * Avoid user_access_save in fast-path: find_watchpoint is safe without * user_access_save, as the address that ptr points to is only used to diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c index cc6000239dc0..d26a052d3383 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/test.c @@ -92,6 +92,16 @@ static bool test_matching_access(void) return false; if (WARN_ON(matching_access(9, 1, 10, 1))) return false; + + /* + * An access of size 0 could match another access, as demonstrated here. 
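+ * (matching_access(8, 8, 12, 0) computes the zero-sized range's end as
+ * 12 + 0 - 1 = 11, which still falls inside the 8-byte range [8, 15]).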
+ * Rather than add more comparisons to 'matching_access()', which would + * end up in the fast-path for *all* checks, check_access() simply + * returns for all accesses of size 0. + */ + if (WARN_ON(!matching_access(8, 8, 12, 0))) + return false; + return true; } -- cgit v1.2.3 From d591ec3db75f9eadfa7976ff8796c674c0027715 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:24 +0100 Subject: kcsan: Introduce KCSAN_ACCESS_ASSERT access type The KCSAN_ACCESS_ASSERT access type may be used to introduce dummy reads and writes to assert certain properties of concurrent code, where bugs could not be detected as normal data races. For example, a variable that is only meant to be written by a single CPU, but may be read (without locking) by other CPUs must still be marked properly to avoid data races. However, concurrent writes, regardless if WRITE_ONCE() or not, would be a bug. Using kcsan_check_access(&x, sizeof(x), KCSAN_ACCESS_ASSERT) would allow catching such bugs. To support KCSAN_ACCESS_ASSERT the following notable changes were made: * If an access is of type KCSAN_ASSERT_ACCESS, disable various filters that only apply to data races, so that all races that KCSAN observes are reported. * Bug reports that involve an ASSERT access type will be reported as "KCSAN: assert: race in ..." instead of "data-race"; this will help more easily distinguish them. * Update a few comments to just mention 'races' where we do not always mean pure data races. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 44 ++++++++++++++++++++++++++++++++++++++------ kernel/kcsan/debugfs.c | 1 + kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 43 +++++++++++++++++++++++++++++++------------ 4 files changed, 77 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 82c2bef827d4..87ef01e40199 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -56,7 +56,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { /* * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary - * slot (middle) is fine if we assume that data races occur rarely. The set of + * slot (middle) is fine if we assume that races occur rarely. The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. */ @@ -178,6 +178,14 @@ is_atomic(const volatile void *ptr, size_t size, int type) if ((type & KCSAN_ACCESS_ATOMIC) != 0) return true; + /* + * Unless explicitly declared atomic, never consider an assertion access + * as atomic. This allows using them also in atomic regions, such as + * seqlocks, without implicitly changing their semantics. 
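As a sketch of the intended use spelled out in the changelog above; the variable and function names here are hypothetical and not part of this patch:

	#include <linux/kcsan-checks.h>

	/* Written only from the owner's update path; lockless readers are fine. */
	static unsigned long owner_only_counter;

	static void owner_update(void)
	{
		/* Any concurrent writer, marked or plain, is reported as "assert no writes". */
		kcsan_check_access(&owner_only_counter, sizeof(owner_only_counter),
				   KCSAN_ACCESS_ASSERT);
		owner_only_counter++;
	}

The ASSERT_EXCLUSIVE_WRITER() and ASSERT_EXCLUSIVE_ACCESS() calls exercised in the debugfs test later in this series are convenience wrappers around checks of this kind.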
+ */ + if ((type & KCSAN_ACCESS_ASSERT) != 0) + return false; + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && IS_ALIGNED((unsigned long)ptr, size)) @@ -298,7 +306,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, */ kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); } - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + if ((type & KCSAN_ACCESS_ASSERT) != 0) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + else + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); user_access_restore(flags); } @@ -307,6 +319,7 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) { const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; union { u8 _1; @@ -429,13 +442,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) /* * No need to increment 'data_races' counter, as the racing * thread already did. + * + * Count 'assert_failures' for each failed ASSERT access, + * therefore both this thread and the racing thread may + * increment this counter. */ - kcsan_report(ptr, size, type, size > 8 || value_change, - smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + /* + * - If we were not able to observe a value change due to size + * constraints, always assume a value change. + * - If the access type is an assertion, we also always assume a + * value change to always report the race. + */ + value_change = value_change || size > 8 || is_assert; + + kcsan_report(ptr, size, type, value_change, smp_processor_id(), + KCSAN_REPORT_RACE_SIGNAL); } else if (value_change) { /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); - if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, true, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); @@ -471,7 +503,7 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the - * slow-path, as long as no state changes that cause a data race to be + * slow-path, as long as no state changes that cause a race to be * detected and reported have occurred until kcsan_is_enabled() is * checked. */ diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index bec42dab32ee..a9dad44130e6 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -44,6 +44,7 @@ static const char *counter_to_name(enum kcsan_counter_id id) case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; case KCSAN_COUNTER_REPORT_RACES: return "report_races"; case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 8492da45494b..50078e7d43c3 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -39,6 +39,13 @@ enum kcsan_counter_id { */ KCSAN_COUNTER_DATA_RACES, + /* + * Total number of ASSERT failures due to races. 
If the observed race is + * due to two conflicting ASSERT type accesses, then both will be + * counted. + */ + KCSAN_COUNTER_ASSERT_FAILURES, + /* * Number of times no watchpoints were available. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 7cd34285df74..3bc590e6be7e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -34,11 +34,11 @@ static struct { } other_info = { .ptr = NULL }; /* - * Information about reported data races; used to rate limit reporting. + * Information about reported races; used to rate limit reporting. */ struct report_time { /* - * The last time the data race was reported. + * The last time the race was reported. */ unsigned long time; @@ -57,7 +57,7 @@ struct report_time { * * Therefore, we use a fixed-size array, which at most will occupy a page. This * still adequately rate limits reports, assuming that a) number of unique data - * races is not excessive, and b) occurrence of unique data races within the + * races is not excessive, and b) occurrence of unique races within the * same time window is limited. */ #define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) @@ -74,7 +74,7 @@ static struct report_time report_times[REPORT_TIMES_SIZE]; static DEFINE_SPINLOCK(report_lock); /* - * Checks if the data race identified by thread frames frame1 and frame2 has + * Checks if the race identified by thread frames frame1 and frame2 has * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). */ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) @@ -90,7 +90,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); - /* Check if a matching data race report exists. */ + /* Check if a matching race report exists. */ for (i = 0; i < REPORT_TIMES_SIZE; ++i) { struct report_time *rt = &report_times[i]; @@ -114,7 +114,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) if (time_before(rt->time, invalid_before)) continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ - /* Reported recently, check if data race matches. */ + /* Reported recently, check if race matches. */ if ((rt->frame1 == frame1 && rt->frame2 == frame2) || (rt->frame1 == frame2 && rt->frame2 == frame1)) return true; @@ -142,11 +142,12 @@ skip_report(bool value_change, unsigned long top_frame) * 3. write watchpoint, conflicting write (value_change==true): report; * 4. write watchpoint, conflicting write (value_change==false): skip; * 5. write watchpoint, conflicting read (value_change==false): skip; - * 6. write watchpoint, conflicting read (value_change==true): impossible; + * 6. write watchpoint, conflicting read (value_change==true): report; * * Cases 1-4 are intuitive and expected; case 5 ensures we do not report - * data races where the write may have rewritten the same value; and - * case 6 is simply impossible. + * data races where the write may have rewritten the same value; case 6 + * is possible either if the size is larger than what we check value + * changes for or the access type is KCSAN_ACCESS_ASSERT. 
*/ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { /* @@ -178,11 +179,27 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + + /* + * ASSERT variants: + */ + case KCSAN_ACCESS_ASSERT: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: + return "assert no writes"; + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "assert no accesses"; + default: BUG(); } } +static const char *get_bug_type(int type) +{ + return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race"; +} + /* Return thread description: in task or interrupt. */ static const char *get_thread_desc(int task_id) { @@ -268,13 +285,15 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, * Do not print offset of functions to keep title short. */ cmp = sym_strcmp((void *)other_frame, (void *)this_frame); - pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + pr_err("BUG: KCSAN: %s in %ps / %ps\n", + get_bug_type(access_type | other_info.access_type), (void *)(cmp < 0 ? other_frame : this_frame), (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: data-race in %pS\n", (void *)this_frame); + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(access_type), + (void *)this_frame); break; default: @@ -427,7 +446,7 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a data race in utilities used by + * into lockdep via KCSAN if we detect a race in utilities used by * lockdep. */ lockdep_off(); -- cgit v1.2.3 From a312013578e4775003689e31c1f487df11f362a3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:26 +0100 Subject: kcsan: Add test to generate conflicts via debugfs Add 'test=' option to KCSAN's debugfs interface to invoke KCSAN checks on a dummy variable. By writing 'test=' to the debugfs file from multiple tasks, we can generate real conflicts, and trigger data race reports. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/debugfs.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index a9dad44130e6..9bbba0e57c9b 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -69,9 +70,9 @@ void kcsan_counter_dec(enum kcsan_counter_id id) /* * The microbenchmark allows benchmarking KCSAN core runtime only. To run * multiple threads, pipe 'microbench=' from multiple tasks into the - * debugfs file. + * debugfs file. This will not generate any conflicts, and tests fast-path only. */ -static void microbenchmark(unsigned long iters) +static noinline void microbenchmark(unsigned long iters) { cycles_t cycles; @@ -81,18 +82,52 @@ static void microbenchmark(unsigned long iters) while (iters--) { /* * We can run this benchmark from multiple tasks; this address - * calculation increases likelyhood of some accesses overlapping - * (they still won't conflict because all are reads). + * calculation increases likelyhood of some accesses + * overlapping. 
Make the access type an atomic read, to never + * set up watchpoints and test the fast-path only. */ unsigned long addr = iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); - __kcsan_check_read((void *)addr, sizeof(long)); + __kcsan_check_access((void *)addr, sizeof(long), KCSAN_ACCESS_ATOMIC); } cycles = get_cycles() - cycles; pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); } +/* + * Simple test to create conflicting accesses. Write 'test=' to KCSAN's + * debugfs file from multiple tasks to generate real conflicts and show reports. + */ +static long test_dummy; +static noinline void test_thread(unsigned long iters) +{ + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + cycles_t cycles; + + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + __kcsan_check_read(&test_dummy, sizeof(test_dummy)); + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + ASSERT_EXCLUSIVE_WRITER(test_dummy); + ASSERT_EXCLUSIVE_ACCESS(test_dummy); + + /* not actually instrumented */ + WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + /* restore context */ + current->kcsan_ctx = ctx_save; +} + static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -242,6 +277,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) return -EINVAL; microbenchmark(iters); + } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) + return -EINVAL; + test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { -- cgit v1.2.3 From 80d4c4775216602ccdc9e761ce251c8451d0c6ca Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 7 Feb 2020 19:59:10 +0100 Subject: kcsan: Expose core configuration parameters as module params This adds early_boot, udelay_{task,interrupt}, and skip_watch as module params. The latter parameters are useful to modify at runtime to tune KCSAN's performance on new systems. This will also permit auto-tuning these parameters to maximize overall system performance and KCSAN's race detection ability. None of the parameters are used in the fast-path and referring to them via static variables instead of CONFIG constants will not affect performance. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar Cc: Qian Cai --- kernel/kcsan/core.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 87ef01e40199..498b1eb3c1cd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,20 @@ #include "encoding.h" #include "kcsan.h" +static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); +static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; +static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; +static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kcsan." +module_param_named(early_enable, kcsan_early_enable, bool, 0); +module_param_named(udelay_task, kcsan_udelay_task, uint, 0644); +module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); +module_param_named(skip_watch, kcsan_skip_watch, long, 0644); + bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ @@ -239,9 +254,9 @@ should_watch(const volatile void *ptr, size_t size, int type) static inline void reset_kcsan_skip(void) { - long skip_count = CONFIG_KCSAN_SKIP_WATCH - + long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(CONFIG_KCSAN_SKIP_WATCH) : + prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -253,8 +268,7 @@ static __always_inline bool kcsan_is_enabled(void) static inline unsigned int get_delay(void) { - unsigned int delay = in_task() ? CONFIG_KCSAN_UDELAY_TASK : - CONFIG_KCSAN_UDELAY_INTERRUPT; + unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? prandom_u32_max(delay) : 0); @@ -527,7 +541,7 @@ void __init kcsan_init(void) * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE)) + if (kcsan_early_enable) WRITE_ONCE(kcsan_enabled, true); } -- cgit v1.2.3 From 3a5b45e5031fa395ab6e53545fb726b2d5b104f0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 10 Feb 2020 15:56:39 +0100 Subject: kcsan: Fix misreporting if concurrent races on same address If there are at least 4 threads racing on the same address, it can happen that one of the readers may observe another matching reader in other_info. To avoid locking up, we have to consume 'other_info' regardless, but skip the report. See the added comment for more details. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 3bc590e6be7e..abf6852dff72 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -422,6 +422,44 @@ retry: return false; } + access_type |= other_info.access_type; + if ((access_type & KCSAN_ACCESS_WRITE) == 0) { + /* + * While the address matches, this is not the other_info + * from the thread that consumed our watchpoint, since + * neither this nor the access in other_info is a write. + * It is invalid to continue with the report, since we + * only have information about reads. 
+ * + * This can happen due to concurrent races on the same + * address, with at least 4 threads. To avoid locking up + * other_info and all other threads, we have to consume + * it regardless. + * + * A concrete case to illustrate why we might lock up if + * we do not consume other_info: + * + * We have 4 threads, all accessing the same address + * (or matching address ranges). Assume the following + * watcher and watchpoint consumer pairs: + * write1-read1, read2-write2. The first to populate + * other_info is write2, however, write1 consumes it, + * resulting in a report of write1-write2. This report + * is valid, however, now read1 populates other_info; + * read2-read1 is an invalid conflict, yet, no other + * conflicting access is left. Therefore, we must + * consume read1's other_info. + * + * Since this case is assumed to be rare, it is + * reasonable to omit this report: one of the other + * reports includes information about the same shared + * data, and at this point the likelihood that we + * re-report the same race again is high. + */ + release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + return false; + } + /* * Matching & usable access in other_info: keep other_info_lock * locked, as this thread consumes it to print the full report; -- cgit v1.2.3 From b738f6169f1260b4ed5bd9f220b1c84d79f3ab8d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:21 +0100 Subject: kcsan: Introduce kcsan_value_change type Introduces kcsan_value_change type, which explicitly points out if we either observed a value-change (TRUE), or we could not observe one but cannot rule out a value-change happened (MAYBE). The MAYBE state can either be reported or not, depending on configuration preferences. A follow-up patch introduces the FALSE state, which should never be reported. No functional change intended. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 38 ++++++++++++++++++++++---------------- kernel/kcsan/kcsan.h | 19 ++++++++++++++++++- kernel/kcsan/report.c | 26 ++++++++++++++------------ 3 files changed, 54 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 498b1eb3c1cd..3f89801161d3 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -341,7 +341,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) u32 _4; u64 _8; } expect_value; - bool value_change = false; + enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; @@ -398,6 +398,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. 
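The hunks that follow initialize expect_value._8 and switch the comparison to an XOR-accumulating form; condensed to the 64-bit case as a stand-alone sketch of the idea (ptr and the watchpoint delay are assumed context):

	u64 expect = 0;
	enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;

	expect ^= READ_ONCE(*(const u64 *)ptr);	/* snapshot before the delay */
	/* ... watchpoint armed, delay, racing accesses may occur here ... */
	expect ^= READ_ONCE(*(const u64 *)ptr);	/* fold in the value after the delay */

	if (expect != 0)	/* any differing bit implies an observed value change */
		value_change = KCSAN_VALUE_CHANGE_TRUE;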
*/ + expect_value._8 = 0; switch (size) { case 1: expect_value._1 = READ_ONCE(*(const u8 *)ptr); @@ -436,23 +437,36 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) */ switch (size) { case 1: - value_change = expect_value._1 != READ_ONCE(*(const u8 *)ptr); + expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); break; case 2: - value_change = expect_value._2 != READ_ONCE(*(const u16 *)ptr); + expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); break; case 4: - value_change = expect_value._4 != READ_ONCE(*(const u32 *)ptr); + expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); break; case 8: - value_change = expect_value._8 != READ_ONCE(*(const u64 *)ptr); + expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); break; default: break; /* ignore; we do not diff the values */ } + /* Were we able to observe a value-change? */ + if (expect_value._8 != 0) + value_change = KCSAN_VALUE_CHANGE_TRUE; + /* Check if this access raced with another. */ if (!remove_watchpoint(watchpoint)) { + /* + * Depending on the access type, map a value_change of MAYBE to + * TRUE (require reporting). + */ + if (value_change == KCSAN_VALUE_CHANGE_MAYBE && (size > 8 || is_assert)) { + /* Always assume a value-change. */ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } + /* * No need to increment 'data_races' counter, as the racing * thread already did. @@ -461,20 +475,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * therefore both this thread and the racing thread may * increment this counter. */ - if (is_assert) + if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - /* - * - If we were not able to observe a value change due to size - * constraints, always assume a value change. - * - If the access type is an assertion, we also always assume a - * value change to always report the race. - */ - value_change = value_change || size > 8 || is_assert; - kcsan_report(ptr, size, type, value_change, smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); - } else if (value_change) { + } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); @@ -482,7 +488,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) - kcsan_report(ptr, size, type, true, + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 50078e7d43c3..83a79b08b550 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -88,6 +88,22 @@ extern void kcsan_counter_dec(enum kcsan_counter_id id); */ extern bool kcsan_skip_report_debugfs(unsigned long func_addr); +/* + * Value-change states. + */ +enum kcsan_value_change { + /* + * Did not observe a value-change, however, it is valid to report the + * race, depending on preferences. + */ + KCSAN_VALUE_CHANGE_MAYBE, + + /* + * The value was observed to change, and the race should be reported. + */ + KCSAN_VALUE_CHANGE_TRUE, +}; + enum kcsan_report_type { /* * The thread that set up the watchpoint and briefly stalled was @@ -111,6 +127,7 @@ enum kcsan_report_type { * Print a race report from thread that encountered the race. 
*/ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, enum kcsan_report_type type); + enum kcsan_value_change value_change, int cpu_id, + enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index abf6852dff72..d871476dc134 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -130,26 +130,27 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) * Special rules to skip reporting. */ static bool -skip_report(bool value_change, unsigned long top_frame) +skip_report(enum kcsan_value_change value_change, unsigned long top_frame) { /* - * The first call to skip_report always has value_change==true, since we + * The first call to skip_report always has value_change==TRUE, since we * cannot know the value written of an instrumented access. For the 2nd * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY: * - * 1. read watchpoint, conflicting write (value_change==true): report; - * 2. read watchpoint, conflicting write (value_change==false): skip; - * 3. write watchpoint, conflicting write (value_change==true): report; - * 4. write watchpoint, conflicting write (value_change==false): skip; - * 5. write watchpoint, conflicting read (value_change==false): skip; - * 6. write watchpoint, conflicting read (value_change==true): report; + * 1. read watchpoint, conflicting write (value_change==TRUE): report; + * 2. read watchpoint, conflicting write (value_change==MAYBE): skip; + * 3. write watchpoint, conflicting write (value_change==TRUE): report; + * 4. write watchpoint, conflicting write (value_change==MAYBE): skip; + * 5. write watchpoint, conflicting read (value_change==MAYBE): skip; + * 6. write watchpoint, conflicting read (value_change==TRUE): report; * * Cases 1-4 are intuitive and expected; case 5 ensures we do not report * data races where the write may have rewritten the same value; case 6 * is possible either if the size is larger than what we check value * changes for or the access type is KCSAN_ACCESS_ASSERT. */ - if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && + value_change == KCSAN_VALUE_CHANGE_MAYBE) { /* * The access is a write, but the data value did not change. * @@ -245,7 +246,7 @@ static int sym_strcmp(void *addr1, void *addr2) * Returns true if a report was generated, false otherwise. */ static bool print_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, + enum kcsan_value_change value_change, int cpu_id, enum kcsan_report_type type) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; @@ -258,7 +259,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, /* * Must check report filter rules before starting to print. 
*/ - if (skip_report(true, stack_entries[skipnr])) + if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -477,7 +478,8 @@ retry: } void kcsan_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, enum kcsan_report_type type) + enum kcsan_value_change value_change, int cpu_id, + enum kcsan_report_type type) { unsigned long flags = 0; -- cgit v1.2.3 From 81af89e15862909881ff010a0adb67148487e88a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:22 +0100 Subject: kcsan: Add kcsan_set_access_mask() support When setting up an access mask with kcsan_set_access_mask(), KCSAN will only report races if concurrent changes to bits set in access_mask are observed. Conveying access_mask via a separate call avoids introducing overhead in the common-case fast-path. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 43 +++++++++++++++++++++++++++++++++++++++---- kernel/kcsan/kcsan.h | 5 +++++ kernel/kcsan/report.c | 13 ++++++++++++- 3 files changed, 56 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3f89801161d3..589b1e7f0f25 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -39,6 +39,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_next = 0, .atomic_nest_count = 0, .in_flat_atomic = false, + .access_mask = 0, }; /* @@ -298,6 +299,15 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (!kcsan_is_enabled()) return; + + /* + * The access_mask check relies on value-change comparison. To avoid + * reporting a race where e.g. the writer set up the watchpoint, but the + * reader has access_mask!=0, we have to ignore the found watchpoint. + */ + if (get_ctx()->access_mask != 0) + return; + /* * Consume the watchpoint as soon as possible, to minimize the chances * of !consumed. Consuming the watchpoint must always be guarded by @@ -341,6 +351,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) u32 _4; u64 _8; } expect_value; + unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; @@ -435,18 +446,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ + access_mask = get_ctx()->access_mask; switch (size) { case 1: expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); + if (access_mask) + expect_value._1 &= (u8)access_mask; break; case 2: expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); + if (access_mask) + expect_value._2 &= (u16)access_mask; break; case 4: expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); + if (access_mask) + expect_value._4 &= (u32)access_mask; break; case 8: expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); + if (access_mask) + expect_value._8 &= (u64)access_mask; break; default: break; /* ignore; we do not diff the values */ @@ -460,11 +480,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (!remove_watchpoint(watchpoint)) { /* * Depending on the access type, map a value_change of MAYBE to - * TRUE (require reporting). + * TRUE (always report) or FALSE (never report). */ - if (value_change == KCSAN_VALUE_CHANGE_MAYBE && (size > 8 || is_assert)) { - /* Always assume a value-change. 
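As a sketch of how the new mask is meant to be used by callers (the flags variable and mask below are hypothetical); the ASSERT_EXCLUSIVE_BITS() helper introduced in the next patch wraps essentially this pattern:

	/* Readers rely on WATCHED_BITS of hyp_flags never changing concurrently,
	 * while the remaining bits may be updated at any time. */
	kcsan_set_access_mask(WATCHED_BITS);
	__kcsan_check_access(&hyp_flags, sizeof(hyp_flags), KCSAN_ACCESS_ASSERT);
	kcsan_set_access_mask(0);

With a non-zero mask, a race on this location is only reported if a change to the masked bits is actually observed; the hunk continuing below maps a maybe-changed value to KCSAN_VALUE_CHANGE_FALSE in that case, so races confined to the ~mask bits stay silent.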
*/ - value_change = KCSAN_VALUE_CHANGE_TRUE; + if (value_change == KCSAN_VALUE_CHANGE_MAYBE) { + if (access_mask != 0) { + /* + * For access with access_mask, we require a + * value-change, as it is likely that races on + * ~access_mask bits are expected. + */ + value_change = KCSAN_VALUE_CHANGE_FALSE; + } else if (size > 8 || is_assert) { + /* Always assume a value-change. */ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } } /* @@ -622,6 +651,12 @@ void kcsan_atomic_next(int n) } EXPORT_SYMBOL(kcsan_atomic_next); +void kcsan_set_access_mask(unsigned long mask) +{ + get_ctx()->access_mask = mask; +} +EXPORT_SYMBOL(kcsan_set_access_mask); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 83a79b08b550..892de5120c1b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -98,6 +98,11 @@ enum kcsan_value_change { */ KCSAN_VALUE_CHANGE_MAYBE, + /* + * Did not observe a value-change, and it is invalid to report the race. + */ + KCSAN_VALUE_CHANGE_FALSE, + /* * The value was observed to change, and the race should be reported. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index d871476dc134..11c791b886f3 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -132,6 +132,9 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) static bool skip_report(enum kcsan_value_change value_change, unsigned long top_frame) { + /* Should never get here if value_change==FALSE. */ + WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE); + /* * The first call to skip_report always has value_change==TRUE, since we * cannot know the value written of an instrumented access. For the 2nd @@ -493,7 +496,15 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { - if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) + /* + * Never report if value_change is FALSE, only if we it is + * either TRUE or MAYBE. In case of MAYBE, further filtering may + * be done once we know the full stack trace in print_report(). + */ + bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && + print_report(ptr, size, access_type, value_change, cpu_id, type); + + if (reported && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); -- cgit v1.2.3 From 703b321501c95c658275fd76775282fe45989641 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:23 +0100 Subject: kcsan: Introduce ASSERT_EXCLUSIVE_BITS(var, mask) This introduces ASSERT_EXCLUSIVE_BITS(var, mask). ASSERT_EXCLUSIVE_BITS(var, mask) will cause KCSAN to assume that the following access is safe w.r.t. data races (however, please see the docbook comment for disclaimer here). For more context on why this was considered necessary, please see: http://lkml.kernel.org/r/1580995070-25139-1-git-send-email-cai@lca.pw In particular, before this patch, data races between reads (that use @mask bits of an access that should not be modified concurrently) and writes (that change ~@mask bits not used by the readers) would have been annotated with "data_race()" (or "READ_ONCE()"). However, doing so would then hide real problems: we would no longer be able to detect harmful races between reads to @mask bits and writes to @mask bits. Therefore, by using ASSERT_EXCLUSIVE_BITS(var, mask), we accomplish: 1. 
Avoid proliferation of specific macros at the call sites: by including a single mask in the argument list, we can use the same macro in a wide variety of call sites, regardless of how and which bits in a field each call site actually accesses. 2. The existing code does not need to be modified (although READ_ONCE() may still be advisable if we cannot prove that the data race is always safe). 3. We catch bugs where the exclusive bits are modified concurrently. 4. We document properties of the current code. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Cc: David Hildenbrand Cc: Jan Kara Cc: Qian Cai --- kernel/kcsan/debugfs.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 9bbba0e57c9b..2ff196123977 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -100,8 +100,10 @@ static noinline void microbenchmark(unsigned long iters) * debugfs file from multiple tasks to generate real conflicts and show reports. */ static long test_dummy; +static long test_flags; static noinline void test_thread(unsigned long iters) { + const long CHANGE_BITS = 0xff00ff00ff00ff00L; const struct kcsan_ctx ctx_save = current->kcsan_ctx; cycles_t cycles; @@ -109,16 +111,27 @@ static noinline void test_thread(unsigned long iters) memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); cycles = get_cycles(); while (iters--) { + /* These all should generate reports. */ __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); ASSERT_EXCLUSIVE_WRITER(test_dummy); ASSERT_EXCLUSIVE_ACCESS(test_dummy); + ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + + ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + /* not actually instrumented */ WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + + test_flags ^= CHANGE_BITS; /* generate value-change */ + __kcsan_check_write(&test_flags, sizeof(test_flags)); } cycles = get_cycles() - cycles; -- cgit v1.2.3 From f5d2313bd3c540be405c4977a63840cd6d0167b5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Feb 2020 22:10:35 +0100 Subject: kcsan, trace: Make KCSAN compatible with tracing Previously the system would lock up if ftrace was enabled together with KCSAN. This is due to recursion on reporting if the tracer code is instrumented with KCSAN. To avoid this for all types of tracing, disable KCSAN instrumentation for all of kernel/trace. Furthermore, since KCSAN relies on udelay() to introduce delay, we have to disable ftrace for udelay() (currently done for x86) in case KCSAN is used together with lockdep and ftrace. The reason is that it may corrupt lockdep IRQ flags tracing state due to a peculiar case of recursion (details in Makefile comment). Reported-by: Qian Cai Tested-by: Qian Cai Acked-by: Steven Rostedt (VMware) Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/Makefile | 2 ++ kernel/trace/Makefile | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index df6b7799e492..d4999b38d1be 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -4,6 +4,8 @@ KCOV_INSTRUMENT := n UBSAN_SANITIZE := n CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ $(call cc-option,-fno-stack-protector,) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..6b601d88bf71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) +# Avoid recursion due to instrumentation. +KCSAN_SANITIZE := n + ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From 48b1fc190a180d971fb69217c88c7247f4f2ca19 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Feb 2020 23:02:09 +0100 Subject: kcsan: Add option to allow watcher interruptions Add option to allow interrupts while a watchpoint is set up. This can be enabled either via CONFIG_KCSAN_INTERRUPT_WATCHER or via the boot parameter 'kcsan.interrupt_watcher=1'. Note that, currently not all safe per-CPU access primitives and patterns are accounted for, which could result in false positives. For example, asm-generic/percpu.h uses plain operations, which by default are instrumented. On interrupts and subsequent accesses to the same variable, KCSAN would currently report a data race with this option. Therefore, this option should currently remain disabled by default, but may be enabled for specific test scenarios. To avoid new warnings, changes all uses of smp_processor_id() to use the raw version (as already done in kcsan_found_watchpoint()). The exact SMP processor id is for informational purposes in the report, and correctness is not affected. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/core.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 589b1e7f0f25..e7387fec6679 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -21,6 +21,7 @@ static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; +static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX @@ -30,6 +31,7 @@ module_param_named(early_enable, kcsan_early_enable, bool, 0); module_param_named(udelay_task, kcsan_udelay_task, uint, 0644); module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); module_param_named(skip_watch, kcsan_skip_watch, long, 0644); +module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); bool kcsan_enabled; @@ -354,7 +356,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); - unsigned long irq_flags; + unsigned long irq_flags = 0; /* * Always reset kcsan_skip counter in slow-path to avoid underflow; see @@ -370,26 +372,9 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } - /* - * Disable interrupts & preemptions to avoid another thread on the same - * CPU accessing memory locations for the set up watchpoint; this is to - * avoid reporting races to e.g. CPU-local data. - * - * An alternative would be adding the source CPU to the watchpoint - * encoding, and checking that watchpoint-CPU != this-CPU. There are - * several problems with this: - * 1. we should avoid stealing more bits from the watchpoint encoding - * as it would affect accuracy, as well as increase performance - * overhead in the fast-path; - * 2. if we are preempted, but there *is* a genuine data race, we - * would *not* report it -- since this is the common case (vs. - * CPU-local data accesses), it makes more sense (from a data race - * detection point of view) to simply disable preemptions to ensure - * as many tasks as possible run on other CPUs. - * - * Use raw versions, to avoid lockdep recursion via IRQ flags tracing. - */ - raw_local_irq_save(irq_flags); + if (!kcsan_interrupt_watcher) + /* Use raw to avoid lockdep recursion via IRQ flags tracing. */ + raw_local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -507,7 +492,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, smp_processor_id(), + kcsan_report(ptr, size, type, value_change, raw_smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. 
*/ @@ -518,13 +503,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - smp_processor_id(), + raw_smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: - raw_local_irq_restore(irq_flags); + if (!kcsan_interrupt_watcher) + raw_local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } -- cgit v1.2.3 From 2402d0eae589a31ee7b1774cb220d84d0f5605b4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Sat, 22 Feb 2020 00:10:27 +0100 Subject: kcsan: Add option for verbose reporting Adds CONFIG_KCSAN_VERBOSE to optionally enable more verbose reports. Currently information about the reporting task's held locks and IRQ trace events are shown, if they are enabled. Signed-off-by: Marco Elver Suggested-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 4 +- kernel/kcsan/kcsan.h | 3 ++ kernel/kcsan/report.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 107 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index e7387fec6679..065615df88ea 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -18,8 +18,8 @@ #include "kcsan.h" static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); -static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; -static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; +unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; +unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 892de5120c1b..e282f8b5749e 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -13,6 +13,9 @@ /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 +extern unsigned int kcsan_udelay_task; +extern unsigned int kcsan_udelay_interrupt; + /* * Globally enable and disable KCSAN. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 11c791b886f3..18f9d3bc93a5 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include @@ -31,7 +33,26 @@ static struct { int cpu_id; unsigned long stack_entries[NUM_STACK_ENTRIES]; int num_stack_entries; -} other_info = { .ptr = NULL }; + + /* + * Optionally pass @current. Typically we do not need to pass @current + * via @other_info since just @task_pid is sufficient. Passing @current + * has additional overhead. + * + * To safely pass @current, we must either use get_task_struct/ + * put_task_struct, or stall the thread that populated @other_info. + * + * We cannot rely on get_task_struct/put_task_struct in case + * release_report() races with a task being released, and would have to + * free it in release_report(). This may result in deadlock if we want + * to use KCSAN on the allocators. + * + * Since we also want to reliably print held locks for + * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread + * that populated @other_info until it has been consumed. + */ + struct task_struct *task; +} other_info; /* * Information about reported races; used to rate limit reporting. 
@@ -245,6 +266,16 @@ static int sym_strcmp(void *addr1, void *addr2) return strncmp(buf1, buf2, sizeof(buf1)); } +static void print_verbose_info(struct task_struct *task) +{ + if (!task) + return; + + pr_err("\n"); + debug_show_held_locks(task); + print_irqtrace_events(task); +} + /* * Returns true if a report was generated, false otherwise. */ @@ -319,6 +350,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, other_info.num_stack_entries - other_skipnr, 0); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(other_info.task); + pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", get_access_type(access_type), ptr, size, @@ -340,6 +374,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(current); + /* Print report footer. */ pr_err("\n"); pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); @@ -357,6 +394,67 @@ static void release_report(unsigned long *flags, enum kcsan_report_type type) spin_unlock_irqrestore(&report_lock, *flags); } +/* + * Sets @other_info.task and awaits consumption of @other_info. + * + * Precondition: report_lock is held. + * Postcondition: report_lock is held. + */ +static void +set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) +{ + /* + * We may be instrumenting a code-path where current->state is already + * something other than TASK_RUNNING. + */ + const bool is_running = current->state == TASK_RUNNING; + /* + * To avoid deadlock in case we are in an interrupt here and this is a + * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a + * timeout to ensure this works in all contexts. + * + * Await approximately the worst case delay of the reporting thread (if + * we are not interrupted). + */ + int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt); + + other_info.task = current; + do { + if (is_running) { + /* + * Let lockdep know the real task is sleeping, to print + * the held locks (recall we turned lockdep off, so + * locking/unlocking @report_lock won't be recorded). + */ + set_current_state(TASK_UNINTERRUPTIBLE); + } + spin_unlock_irqrestore(&report_lock, *flags); + /* + * We cannot call schedule() since we also cannot reliably + * determine if sleeping here is permitted -- see in_atomic(). + */ + + udelay(1); + spin_lock_irqsave(&report_lock, *flags); + if (timeout-- < 0) { + /* + * Abort. Reset other_info.task to NULL, since it + * appears the other thread is still going to consume + * it. It will result in no verbose info printed for + * this task. + */ + other_info.task = NULL; + break; + } + /* + * If @ptr nor @current matches, then our information has been + * consumed and we may continue. If not, retry. + */ + } while (other_info.ptr == ptr && other_info.task == current); + if (is_running) + set_current_state(TASK_RUNNING); +} + /* * Depending on the report type either sets other_info and returns false, or * acquires the matching other_info and returns true. 
If other_info is not @@ -388,6 +486,9 @@ retry: other_info.cpu_id = cpu_id; other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + set_other_info_task_blocking(flags, ptr); + spin_unlock_irqrestore(&report_lock, *flags); /* -- cgit v1.2.3 From 44656d3dc4f0dc20010d054f27397a4a1469fabf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 25 Feb 2020 15:32:58 +0100 Subject: kcsan: Add current->state to implicitly atomic accesses Add volatile current->state to list of implicitly atomic accesses. This is in preparation to eventually enable KCSAN on kernel/sched (which currently still has KCSAN_SANITIZE := n). Since accesses that match the special check in atomic.h are rare, it makes more sense to move this check to the slow-path, avoiding the additional compare in the fast-path. With the microbenchmark, a speedup of ~6% is measured. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 21 +++++++-------------- kernel/kcsan/core.c | 22 +++++++++++++++------- kernel/kcsan/debugfs.c | 27 ++++++++++++++++++--------- 3 files changed, 40 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index a9c193053491..be9e625227f3 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -4,24 +4,17 @@ #define _KERNEL_KCSAN_ATOMIC_H #include +#include /* - * Helper that returns true if access to @ptr should be considered an atomic - * access, even though it is not explicitly atomic. - * - * List all volatile globals that have been observed in races, to suppress - * data race reports between accesses to these variables. - * - * For now, we assume that volatile accesses of globals are as strong as atomic - * accesses (READ_ONCE, WRITE_ONCE cast to volatile). The situation is still not - * entirely clear, as on some architectures (Alpha) READ_ONCE/WRITE_ONCE do more - * than cast to volatile. Eventually, we hope to be able to remove this - * function. + * Special rules for certain memory where concurrent conflicting accesses are + * common, however, the current convention is to not mark them; returns true if + * access to @ptr should be considered atomic. Called from slow-path. */ -static __always_inline bool kcsan_is_atomic(const volatile void *ptr) +static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* only jiffies for now */ - return ptr == &jiffies; + /* volatile globals that have been observed in data races. */ + return ptr == &jiffies || ptr == ¤t->state; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 065615df88ea..eb30ecdc8c00 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -188,12 +188,13 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } +/* Rules for generic atomic accesses. Called from fast-path. */ static __always_inline bool is_atomic(const volatile void *ptr, size_t size, int type) { struct kcsan_ctx *ctx; - if ((type & KCSAN_ACCESS_ATOMIC) != 0) + if (type & KCSAN_ACCESS_ATOMIC) return true; /* @@ -201,16 +202,16 @@ is_atomic(const volatile void *ptr, size_t size, int type) * as atomic. This allows using them also in atomic regions, such as * seqlocks, without implicitly changing their semantics. 
*/ - if ((type & KCSAN_ACCESS_ASSERT) != 0) + if (type & KCSAN_ACCESS_ASSERT) return false; if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && - (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && + (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ ctx = get_ctx(); - if (unlikely(ctx->atomic_next > 0)) { + if (ctx->atomic_next > 0) { /* * Because we do not have separate contexts for nested * interrupts, in case atomic_next is set, we simply assume that @@ -224,10 +225,8 @@ is_atomic(const volatile void *ptr, size_t size, int type) --ctx->atomic_next; /* in task, or outer interrupt */ return true; } - if (unlikely(ctx->atomic_nest_count > 0 || ctx->in_flat_atomic)) - return true; - return kcsan_is_atomic(ptr); + return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic; } static __always_inline bool @@ -367,6 +366,15 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (!kcsan_is_enabled()) goto out; + /* + * Special atomic rules: unlikely to be true, so we check them here in + * the slow-path, and not in the fast-path in is_atomic(). Call after + * kcsan_is_enabled(), as we may access memory that is not yet + * initialized during early boot. + */ + if (!is_assert && kcsan_is_atomic_special(ptr)) + goto out; + if (!check_encodable((unsigned long)ptr, size)) { kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); goto out; diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 2ff196123977..72ee188ebc54 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -74,25 +74,34 @@ void kcsan_counter_dec(enum kcsan_counter_id id) */ static noinline void microbenchmark(unsigned long iters) { + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + const bool was_enabled = READ_ONCE(kcsan_enabled); cycles_t cycles; + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + /* + * Disable to benchmark fast-path for all accesses, and (expected + * negligible) call into slow-path, but never set up watchpoints. + */ + WRITE_ONCE(kcsan_enabled, false); + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { - /* - * We can run this benchmark from multiple tasks; this address - * calculation increases likelyhood of some accesses - * overlapping. Make the access type an atomic read, to never - * set up watchpoints and test the fast-path only. - */ - unsigned long addr = - iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); - __kcsan_check_access((void *)addr, sizeof(long), KCSAN_ACCESS_ATOMIC); + unsigned long addr = iters & ((PAGE_SIZE << 8) - 1); + int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC : + (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0); + __kcsan_check_access((void *)addr, sizeof(long), type); } cycles = get_cycles() - cycles; pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + WRITE_ONCE(kcsan_enabled, was_enabled); + /* restore context */ + current->kcsan_ctx = ctx_save; } /* -- cgit v1.2.3 From e7b34100500733f7e052ce3dee94e6338b86e6bc Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 5 Mar 2020 15:21:07 +0100 Subject: kcsan: Fix a typo in a comment s/slots slots/slots/ Signed-off-by: Qiujun Huang Reviewed-by: Nick Desaulniers [elver: commit message] Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index eb30ecdc8c00..ee8200835b60 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -45,7 +45,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { }; /* - * Helper macros to index into adjacent slots slots, starting from address slot + * Helper macros to index into adjacent slots, starting from address slot * itself, followed by the right and left slots. * * The purpose is 2-fold: -- cgit v1.2.3 From 135c0872d86948046d11d7083e36c930cc43ac93 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 18 Mar 2020 18:38:44 +0100 Subject: kcsan: Introduce report access_info and other_info Improve readability by introducing access_info and other_info structs, and in preparation of the following commit in this series replaces the single instance of other_info with an array of size 1. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 6 +-- kernel/kcsan/kcsan.h | 2 +- kernel/kcsan/report.c | 147 +++++++++++++++++++++++++------------------------- 3 files changed, 77 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index ee8200835b60..f1c38620e3cf 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -321,7 +321,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { - kcsan_report(ptr, size, type, true, raw_smp_processor_id(), + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT); } else { /* @@ -500,8 +500,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, raw_smp_processor_id(), - KCSAN_REPORT_RACE_SIGNAL); + kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ @@ -511,7 +510,6 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - raw_smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index e282f8b5749e..6630dfe32f31 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -135,7 +135,7 @@ enum kcsan_report_type { * Print a race report from thread that encountered the race. */ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, + enum kcsan_value_change value_change, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 18f9d3bc93a5..de234d1c1b3d 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -19,18 +19,23 @@ */ #define NUM_STACK_ENTRIES 64 +/* Common access info. */ +struct access_info { + const volatile void *ptr; + size_t size; + int access_type; + int task_pid; + int cpu_id; +}; + /* * Other thread info: communicated from other racing thread to thread that set * up the watchpoint, which then prints the complete report atomically. 
Only * need one struct, as all threads should to be serialized regardless to print * the reports, with reporting being in the slow-path. */ -static struct { - const volatile void *ptr; - size_t size; - int access_type; - int task_pid; - int cpu_id; +struct other_info { + struct access_info ai; unsigned long stack_entries[NUM_STACK_ENTRIES]; int num_stack_entries; @@ -52,7 +57,9 @@ static struct { * that populated @other_info until it has been consumed. */ struct task_struct *task; -} other_info; +}; + +static struct other_info other_infos[1]; /* * Information about reported races; used to rate limit reporting. @@ -238,7 +245,7 @@ static const char *get_thread_desc(int task_id) } /* Helper to skip KCSAN-related functions in stack-trace. */ -static int get_stack_skipnr(unsigned long stack_entries[], int num_entries) +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; int skip = 0; @@ -279,9 +286,10 @@ static void print_verbose_info(struct task_struct *task) /* * Returns true if a report was generated, false otherwise. */ -static bool print_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, - enum kcsan_report_type type) +static bool print_report(enum kcsan_value_change value_change, + enum kcsan_report_type type, + const struct access_info *ai, + const struct other_info *other_info) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); @@ -297,9 +305,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { - other_skipnr = get_stack_skipnr(other_info.stack_entries, - other_info.num_stack_entries); - other_frame = other_info.stack_entries[other_skipnr]; + other_skipnr = get_stack_skipnr(other_info->stack_entries, + other_info->num_stack_entries); + other_frame = other_info->stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ if (skip_report(value_change, other_frame)) @@ -321,13 +329,13 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, */ cmp = sym_strcmp((void *)other_frame, (void *)this_frame); pr_err("BUG: KCSAN: %s in %ps / %ps\n", - get_bug_type(access_type | other_info.access_type), + get_bug_type(ai->access_type | other_info->ai.access_type), (void *)(cmp < 0 ? other_frame : this_frame), (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(access_type), + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type), (void *)this_frame); break; @@ -341,30 +349,28 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, switch (type) { case KCSAN_REPORT_RACE_SIGNAL: pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(other_info.access_type), other_info.ptr, - other_info.size, get_thread_desc(other_info.task_pid), - other_info.cpu_id); + get_access_type(other_info->ai.access_type), other_info->ai.ptr, + other_info->ai.size, get_thread_desc(other_info->ai.task_pid), + other_info->ai.cpu_id); /* Print the other thread's stack trace. 
*/ - stack_trace_print(other_info.stack_entries + other_skipnr, - other_info.num_stack_entries - other_skipnr, + stack_trace_print(other_info->stack_entries + other_skipnr, + other_info->num_stack_entries - other_skipnr, 0); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - print_verbose_info(other_info.task); + print_verbose_info(other_info->task); pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(access_type), ptr, size, - get_thread_desc(in_task() ? task_pid_nr(current) : -1), - cpu_id); + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(access_type), ptr, size, - get_thread_desc(in_task() ? task_pid_nr(current) : -1), - cpu_id); + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); break; default: @@ -386,22 +392,23 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, return true; } -static void release_report(unsigned long *flags, enum kcsan_report_type type) +static void release_report(unsigned long *flags, struct other_info *other_info) { - if (type == KCSAN_REPORT_RACE_SIGNAL) - other_info.ptr = NULL; /* mark for reuse */ + if (other_info) + other_info->ai.ptr = NULL; /* Mark for reuse. */ spin_unlock_irqrestore(&report_lock, *flags); } /* - * Sets @other_info.task and awaits consumption of @other_info. + * Sets @other_info->task and awaits consumption of @other_info. * * Precondition: report_lock is held. * Postcondition: report_lock is held. */ -static void -set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) +static void set_other_info_task_blocking(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) { /* * We may be instrumenting a code-path where current->state is already @@ -418,7 +425,7 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) */ int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt); - other_info.task = current; + other_info->task = current; do { if (is_running) { /* @@ -438,19 +445,19 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) spin_lock_irqsave(&report_lock, *flags); if (timeout-- < 0) { /* - * Abort. Reset other_info.task to NULL, since it + * Abort. Reset @other_info->task to NULL, since it * appears the other thread is still going to consume * it. It will result in no verbose info printed for * this task. */ - other_info.task = NULL; + other_info->task = NULL; break; } /* * If @ptr nor @current matches, then our information has been * consumed and we may continue. If not, retry. */ - } while (other_info.ptr == ptr && other_info.task == current); + } while (other_info->ai.ptr == ai->ptr && other_info->task == current); if (is_running) set_current_state(TASK_RUNNING); } @@ -460,9 +467,8 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) * acquires the matching other_info and returns true. If other_info is not * required for the report type, simply acquires report_lock and returns true. 
*/ -static bool prepare_report(unsigned long *flags, const volatile void *ptr, - size_t size, int access_type, int cpu_id, - enum kcsan_report_type type) +static bool prepare_report(unsigned long *flags, enum kcsan_report_type type, + const struct access_info *ai, struct other_info *other_info) { if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && type != KCSAN_REPORT_RACE_SIGNAL) { @@ -476,18 +482,14 @@ retry: switch (type) { case KCSAN_REPORT_CONSUMED_WATCHPOINT: - if (other_info.ptr != NULL) + if (other_info->ai.ptr) break; /* still in use, retry */ - other_info.ptr = ptr; - other_info.size = size; - other_info.access_type = access_type; - other_info.task_pid = in_task() ? task_pid_nr(current) : -1; - other_info.cpu_id = cpu_id; - other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); + other_info->ai = *ai; + other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 1); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - set_other_info_task_blocking(flags, ptr); + set_other_info_task_blocking(flags, ai, other_info); spin_unlock_irqrestore(&report_lock, *flags); @@ -498,37 +500,31 @@ retry: return false; case KCSAN_REPORT_RACE_SIGNAL: - if (other_info.ptr == NULL) + if (!other_info->ai.ptr) break; /* no data available yet, retry */ /* * First check if this is the other_info we are expecting, i.e. * matches based on how watchpoint was encoded. */ - if (!matching_access((unsigned long)other_info.ptr & - WATCHPOINT_ADDR_MASK, - other_info.size, - (unsigned long)ptr & WATCHPOINT_ADDR_MASK, - size)) + if (!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, + (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)) break; /* mismatching watchpoint, retry */ - if (!matching_access((unsigned long)other_info.ptr, - other_info.size, (unsigned long)ptr, - size)) { + if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, + (unsigned long)ai->ptr, ai->size)) { /* * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc( - KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); /* discard this other_info */ - release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + release_report(flags, other_info); return false; } - access_type |= other_info.access_type; - if ((access_type & KCSAN_ACCESS_WRITE) == 0) { + if (!((ai->access_type | other_info->ai.access_type) & KCSAN_ACCESS_WRITE)) { /* * While the address matches, this is not the other_info * from the thread that consumed our watchpoint, since @@ -561,15 +557,11 @@ retry: * data, and at this point the likelihood that we * re-report the same race again is high. */ - release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + release_report(flags, other_info); return false; } - /* - * Matching & usable access in other_info: keep other_info_lock - * locked, as this thread consumes it to print the full report; - * unlocked in release_report. - */ + /* Matching access in other_info. */ return true; default: @@ -582,10 +574,19 @@ retry: } void kcsan_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, + enum kcsan_value_change value_change, enum kcsan_report_type type) { unsigned long flags = 0; + const struct access_info ai = { + .ptr = ptr, + .size = size, + .access_type = access_type, + .task_pid = in_task() ? 
task_pid_nr(current) : -1, + .cpu_id = raw_smp_processor_id() + }; + struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN + ? NULL : &other_infos[0]; /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if @@ -596,19 +597,19 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, lockdep_off(); kcsan_disable_current(); - if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { + if (prepare_report(&flags, type, &ai, other_info)) { /* * Never report if value_change is FALSE, only if we it is * either TRUE or MAYBE. In case of MAYBE, further filtering may * be done once we know the full stack trace in print_report(). */ bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && - print_report(ptr, size, access_type, value_change, cpu_id, type); + print_report(value_change, type, &ai, other_info); if (reported && panic_on_warn) panic("panic_on_warn set ...\n"); - release_report(&flags, type); + release_report(&flags, other_info); } kcsan_enable_current(); -- cgit v1.2.3 From 6119418f94ca5314392d258d27eb0cb58bb1774e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 18 Mar 2020 18:38:45 +0100 Subject: kcsan: Avoid blocking producers in prepare_report() To avoid deadlock in case watchers can be interrupted, we need to ensure that producers of the struct other_info can never be blocked by an unrelated consumer. (Likely to occur with KCSAN_INTERRUPT_WATCHER.) There are several cases that can lead to this scenario, for example: 1. A watchpoint A was set up by task T1, but interrupted by interrupt I1. Some other thread (task or interrupt) finds watchpoint A consumes it, and sets other_info. Then I1 also finds some unrelated watchpoint B, consumes it, but is blocked because other_info is in use. T1 cannot consume other_info because I1 never returns -> deadlock. 2. A watchpoint A was set up by task T1, but interrupted by interrupt I1, which also sets up a watchpoint B. Some other thread finds watchpoint A, and consumes it and sets up other_info with its information. Similarly some other thread finds watchpoint B and consumes it, but is then blocked because other_info is in use. When I1 continues it sees its watchpoint was consumed, and that it must wait for other_info, which currently contains information to be consumed by T1. However, T1 cannot unblock other_info because I1 never returns -> deadlock. To avoid this, we need to ensure that producers of struct other_info always have a usable other_info entry. This is obviously not the case with only a single instance of struct other_info, as concurrent producers must wait for the entry to be released by some consumer (which may be locked up as illustrated above). While it would be nice if producers could simply call kmalloc() and append their instance of struct other_info to a list, we are very limited in this code path: since KCSAN can instrument the allocators themselves, calling kmalloc() could lead to deadlock or corrupted allocator state. Since producers of the struct other_info will always succeed at try_consume_watchpoint(), preceding the call into kcsan_report(), we know that the particular watchpoint slot cannot simply be reused or consumed by another potential other_info producer. If we move removal of a watchpoint after reporting (by the consumer of struct other_info), we can see a consumed watchpoint as a held lock on elements of other_info, if we create a one-to-one mapping of a watchpoint to an other_info element. 
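As a sketch of the one-to-one mapping described above (the helper function here is hypothetical; the array names, the kcsan_report() signature and remove_watchpoint() follow the diff further down):

        /* One other_info entry per watchpoint slot, so producers never block. */
        static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1];
        static struct other_info other_infos[ARRAY_SIZE(watchpoints)];

        /* Hypothetical consumer-side helper illustrating the index hand-off. */
        static void report_consumed_watchpoint(atomic_long_t *watchpoint,
                                               const volatile void *ptr, size_t size,
                                               int type, enum kcsan_value_change value_change)
        {
                int watchpoint_idx = watchpoint - watchpoints;

                /* The consumed watchpoint acts as a lock on other_infos[watchpoint_idx]. */
                kcsan_report(ptr, size, type, value_change,
                             KCSAN_REPORT_RACE_SIGNAL, watchpoint_idx);

                /* Release the slot only after reporting, so it cannot be reused early. */
                remove_watchpoint(watchpoint);
        }
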
Therefore, the simplest solution is to create an array of struct other_info that is as large as the watchpoints array in core.c, and pass the watchpoint index to kcsan_report() for producers and consumers, and change watchpoints to be removed after reporting is done. With a default config on a 64-bit system, the array other_infos consumes ~37KiB. For most systems today this is not a problem. On smaller memory constrained systems, the config value CONFIG_KCSAN_NUM_WATCHPOINTS can be reduced appropriately. Overall, this change is a simplification of the prepare_report() code, and makes some of the checks (such as checking if at least one access is a write) redundant. Tested: $ tools/testing/selftests/rcutorture/bin/kvm.sh \ --cpus 12 --duration 10 --kconfig "CONFIG_DEBUG_INFO=y \ CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n \ CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n \ CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y \ CONFIG_KCSAN_INTERRUPT_WATCHER=y CONFIG_PROVE_LOCKING=y" \ --configs TREE03 => No longer hangs and runs to completion as expected. Reported-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 31 +++++--- kernel/kcsan/kcsan.h | 3 +- kernel/kcsan/report.c | 212 ++++++++++++++++++++++++-------------------------- 3 files changed, 124 insertions(+), 122 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index f1c38620e3cf..4d8ea0fca5f1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -69,7 +69,6 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * slot=9: [10, 11, 9] * slot=63: [64, 65, 63] */ -#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) #define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) /* @@ -171,12 +170,16 @@ try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); } -/* - * Return true if watchpoint was not touched, false if consumed. - */ -static inline bool remove_watchpoint(atomic_long_t *watchpoint) +/* Return true if watchpoint was not touched, false if already consumed. */ +static inline bool consume_watchpoint(atomic_long_t *watchpoint) { - return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; + return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT; +} + +/* Remove the watchpoint -- its slot may be reused after. */ +static inline void remove_watchpoint(atomic_long_t *watchpoint) +{ + atomic_long_set(watchpoint, INVALID_WATCHPOINT); } static __always_inline struct kcsan_ctx *get_ctx(void) @@ -322,7 +325,8 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (consumed) { kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, - KCSAN_REPORT_CONSUMED_WATCHPOINT); + KCSAN_REPORT_CONSUMED_WATCHPOINT, + watchpoint - watchpoints); } else { /* * The other thread may not print any diagnostics, as it has @@ -470,7 +474,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) value_change = KCSAN_VALUE_CHANGE_TRUE; /* Check if this access raced with another. */ - if (!remove_watchpoint(watchpoint)) { + if (!consume_watchpoint(watchpoint)) { /* * Depending on the access type, map a value_change of MAYBE to * TRUE (always report) or FALSE (never report). 
@@ -500,7 +504,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL); + kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, + watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ @@ -510,9 +515,15 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, + watchpoint - watchpoints); } + /* + * Remove watchpoint; must be after reporting, since the slot may be + * reused after this point. + */ + remove_watchpoint(watchpoint); kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 6630dfe32f31..763d6d08d94b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -12,6 +12,7 @@ /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) extern unsigned int kcsan_udelay_task; extern unsigned int kcsan_udelay_interrupt; @@ -136,6 +137,6 @@ enum kcsan_report_type { */ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, enum kcsan_value_change value_change, - enum kcsan_report_type type); + enum kcsan_report_type type, int watchpoint_idx); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index de234d1c1b3d..ae0a383238ea 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -30,9 +30,7 @@ struct access_info { /* * Other thread info: communicated from other racing thread to thread that set - * up the watchpoint, which then prints the complete report atomically. Only - * need one struct, as all threads should to be serialized regardless to print - * the reports, with reporting being in the slow-path. + * up the watchpoint, which then prints the complete report atomically. */ struct other_info { struct access_info ai; @@ -59,7 +57,11 @@ struct other_info { struct task_struct *task; }; -static struct other_info other_infos[1]; +/* + * To never block any producers of struct other_info, we need as many elements + * as we have watchpoints (upper bound on concurrent races to report). + */ +static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; /* * Information about reported races; used to rate limit reporting. @@ -96,10 +98,11 @@ struct report_time { static struct report_time report_times[REPORT_TIMES_SIZE]; /* - * This spinlock protects reporting and other_info, since other_info is usually - * required when reporting. + * Spinlock serializing report generation, and access to @other_infos. Although + * it could make sense to have a finer-grained locking story for @other_infos, + * report generation needs to be serialized either way, so not much is gained. 
*/ -static DEFINE_SPINLOCK(report_lock); +static DEFINE_RAW_SPINLOCK(report_lock); /* * Checks if the race identified by thread frames frame1 and frame2 has @@ -395,9 +398,13 @@ static bool print_report(enum kcsan_value_change value_change, static void release_report(unsigned long *flags, struct other_info *other_info) { if (other_info) - other_info->ai.ptr = NULL; /* Mark for reuse. */ + /* + * Use size to denote valid/invalid, since KCSAN entirely + * ignores 0-sized accesses. + */ + other_info->ai.size = 0; - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); } /* @@ -435,14 +442,14 @@ static void set_other_info_task_blocking(unsigned long *flags, */ set_current_state(TASK_UNINTERRUPTIBLE); } - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); /* * We cannot call schedule() since we also cannot reliably * determine if sleeping here is permitted -- see in_atomic(). */ udelay(1); - spin_lock_irqsave(&report_lock, *flags); + raw_spin_lock_irqsave(&report_lock, *flags); if (timeout-- < 0) { /* * Abort. Reset @other_info->task to NULL, since it @@ -454,128 +461,107 @@ static void set_other_info_task_blocking(unsigned long *flags, break; } /* - * If @ptr nor @current matches, then our information has been - * consumed and we may continue. If not, retry. + * If invalid, or @ptr nor @current matches, then @other_info + * has been consumed and we may continue. If not, retry. */ - } while (other_info->ai.ptr == ai->ptr && other_info->task == current); + } while (other_info->ai.size && other_info->ai.ptr == ai->ptr && + other_info->task == current); if (is_running) set_current_state(TASK_RUNNING); } -/* - * Depending on the report type either sets other_info and returns false, or - * acquires the matching other_info and returns true. If other_info is not - * required for the report type, simply acquires report_lock and returns true. - */ -static bool prepare_report(unsigned long *flags, enum kcsan_report_type type, - const struct access_info *ai, struct other_info *other_info) +/* Populate @other_info; requires that the provided @other_info not in use. */ +static void prepare_report_producer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) { - if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && - type != KCSAN_REPORT_RACE_SIGNAL) { - /* other_info not required; just acquire report_lock */ - spin_lock_irqsave(&report_lock, *flags); - return true; - } + raw_spin_lock_irqsave(&report_lock, *flags); -retry: - spin_lock_irqsave(&report_lock, *flags); + /* + * The same @other_infos entry cannot be used concurrently, because + * there is a one-to-one mapping to watchpoint slots (@watchpoints in + * core.c), and a watchpoint is only released for reuse after reporting + * is done by the consumer of @other_info. Therefore, it is impossible + * for another concurrent prepare_report_producer() to set the same + * @other_info, and are guaranteed exclusivity for the @other_infos + * entry pointed to by @other_info. + * + * To check this property holds, size should never be non-zero here, + * because every consumer of struct other_info resets size to 0 in + * release_report(). 
+ */ + WARN_ON(other_info->ai.size); - switch (type) { - case KCSAN_REPORT_CONSUMED_WATCHPOINT: - if (other_info->ai.ptr) - break; /* still in use, retry */ + other_info->ai = *ai; + other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2); - other_info->ai = *ai; - other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 1); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + set_other_info_task_blocking(flags, ai, other_info); - if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - set_other_info_task_blocking(flags, ai, other_info); + raw_spin_unlock_irqrestore(&report_lock, *flags); +} - spin_unlock_irqrestore(&report_lock, *flags); +/* Awaits producer to fill @other_info and then returns. */ +static bool prepare_report_consumer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) +{ - /* - * The other thread will print the summary; other_info may now - * be consumed. - */ - return false; + raw_spin_lock_irqsave(&report_lock, *flags); + while (!other_info->ai.size) { /* Await valid @other_info. */ + raw_spin_unlock_irqrestore(&report_lock, *flags); + cpu_relax(); + raw_spin_lock_irqsave(&report_lock, *flags); + } - case KCSAN_REPORT_RACE_SIGNAL: - if (!other_info->ai.ptr) - break; /* no data available yet, retry */ + /* Should always have a matching access based on watchpoint encoding. */ + if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, + (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size))) + goto discard; + if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, + (unsigned long)ai->ptr, ai->size)) { /* - * First check if this is the other_info we are expecting, i.e. - * matches based on how watchpoint was encoded. + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. */ - if (!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, - (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)) - break; /* mismatching watchpoint, retry */ - - if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, - (unsigned long)ai->ptr, ai->size)) { - /* - * If the actual accesses to not match, this was a false - * positive due to watchpoint encoding. - */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); - - /* discard this other_info */ - release_report(flags, other_info); - return false; - } + kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + goto discard; + } - if (!((ai->access_type | other_info->ai.access_type) & KCSAN_ACCESS_WRITE)) { - /* - * While the address matches, this is not the other_info - * from the thread that consumed our watchpoint, since - * neither this nor the access in other_info is a write. - * It is invalid to continue with the report, since we - * only have information about reads. - * - * This can happen due to concurrent races on the same - * address, with at least 4 threads. To avoid locking up - * other_info and all other threads, we have to consume - * it regardless. - * - * A concrete case to illustrate why we might lock up if - * we do not consume other_info: - * - * We have 4 threads, all accessing the same address - * (or matching address ranges). Assume the following - * watcher and watchpoint consumer pairs: - * write1-read1, read2-write2. The first to populate - * other_info is write2, however, write1 consumes it, - * resulting in a report of write1-write2. 
This report - * is valid, however, now read1 populates other_info; - * read2-read1 is an invalid conflict, yet, no other - * conflicting access is left. Therefore, we must - * consume read1's other_info. - * - * Since this case is assumed to be rare, it is - * reasonable to omit this report: one of the other - * reports includes information about the same shared - * data, and at this point the likelihood that we - * re-report the same race again is high. - */ - release_report(flags, other_info); - return false; - } + return true; - /* Matching access in other_info. */ - return true; +discard: + release_report(flags, other_info); + return false; +} +/* + * Depending on the report type either sets @other_info and returns false, or + * awaits @other_info and returns true. If @other_info is not required for the + * report type, simply acquires @report_lock and returns true. + */ +static noinline bool prepare_report(unsigned long *flags, + enum kcsan_report_type type, + const struct access_info *ai, + struct other_info *other_info) +{ + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + prepare_report_producer(flags, ai, other_info); + return false; + case KCSAN_REPORT_RACE_SIGNAL: + return prepare_report_consumer(flags, ai, other_info); default: - BUG(); + /* @other_info not required; just acquire @report_lock. */ + raw_spin_lock_irqsave(&report_lock, *flags); + return true; } - - spin_unlock_irqrestore(&report_lock, *flags); - - goto retry; } void kcsan_report(const volatile void *ptr, size_t size, int access_type, enum kcsan_value_change value_change, - enum kcsan_report_type type) + enum kcsan_report_type type, int watchpoint_idx) { unsigned long flags = 0; const struct access_info ai = { @@ -586,7 +572,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, .cpu_id = raw_smp_processor_id() }; struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN - ? NULL : &other_infos[0]; + ? NULL : &other_infos[watchpoint_idx]; + + kcsan_disable_current(); + if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos))) + goto out; /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if @@ -596,7 +586,6 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, */ lockdep_off(); - kcsan_disable_current(); if (prepare_report(&flags, type, &ai, other_info)) { /* * Never report if value_change is FALSE, only if we it is @@ -611,7 +600,8 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, release_report(&flags, other_info); } - kcsan_enable_current(); lockdep_on(); +out: + kcsan_enable_current(); } -- cgit v1.2.3 From 757a4cefde76697af2b2c284c8a320912b77e7e6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:56 +0100 Subject: kcsan: Add support for scoped accesses This adds support for scoped accesses, where the memory range is checked for the duration of the scope. The feature is implemented by inserting the relevant access information into a list of scoped accesses for the current execution context, which are then checked (until removed) on every call (through instrumentation) into the KCSAN runtime. An alternative, more complex, implementation could set up a watchpoint for the scoped access, and keep the watchpoint set up. This, however, would require first exposing a handle to the watchpoint, as well as dealing with cases such as accesses by the same thread while the watchpoint is still set up (and several more cases). 
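Before returning to that alternative, a minimal usage sketch of the interface this patch adds (obj, its type, and do_something_with() are hypothetical; the begin/end functions and access flags are those introduced in the diff below):

        struct kcsan_scoped_access sa;

        /* From here on, KCSAN re-checks this access on every instrumented call. */
        kcsan_begin_scoped_access(&obj, sizeof(obj), KCSAN_ACCESS_WRITE, &sa);

        do_something_with(&obj);        /* hypothetical work inside the scope */

        /* Conflicting concurrent accesses to obj anywhere in the scope can be reported. */
        kcsan_end_scoped_access(&sa);
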
It is also doubtful if this would provide any benefit, since the majority of delay where the watchpoint is set up is likely due to the injected delays by KCSAN. Therefore, the implementation in this patch is simpler and avoids hurting KCSAN's main use-case (normal data race detection); it also implicitly increases scoped-access race-detection-ability due to increased probability of setting up watchpoints by repeatedly calling __kcsan_check_access() throughout the scope of the access. The implementation required adding an additional conditional branch to the fast-path. However, the microbenchmark showed a *speedup* of ~5% on the fast-path. This appears to be due to subtly improved codegen by GCC from moving get_ctx() and associated load of preempt_count earlier. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/kcsan/report.c | 33 +++++++++++++------- 2 files changed, 97 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4d8ea0fca5f1..a572aae61b98 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_nest_count = 0, .in_flat_atomic = false, .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, }; /* @@ -191,12 +193,23 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } +/* Check scoped accesses; never inline because this is a slow-path! */ +static noinline void kcsan_check_scoped_accesses(void) +{ + struct kcsan_ctx *ctx = get_ctx(); + struct list_head *prev_save = ctx->scoped_accesses.prev; + struct kcsan_scoped_access *scoped_access; + + ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) + __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type); + ctx->scoped_accesses.prev = prev_save; +} + /* Rules for generic atomic accesses. Called from fast-path. */ static __always_inline bool -is_atomic(const volatile void *ptr, size_t size, int type) +is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { - struct kcsan_ctx *ctx; - if (type & KCSAN_ACCESS_ATOMIC) return true; @@ -213,7 +226,6 @@ is_atomic(const volatile void *ptr, size_t size, int type) IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ - ctx = get_ctx(); if (ctx->atomic_next > 0) { /* * Because we do not have separate contexts for nested @@ -233,7 +245,7 @@ is_atomic(const volatile void *ptr, size_t size, int type) } static __always_inline bool -should_watch(const volatile void *ptr, size_t size, int type) +should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { /* * Never set up watchpoints when memory operations are atomic. @@ -242,7 +254,7 @@ should_watch(const volatile void *ptr, size_t size, int type) * should not count towards skipped instructions, and (2) to actually * decrement kcsan_atomic_next for consecutive instruction stream. 
*/ - if (is_atomic(ptr, size, type)) + if (is_atomic(ptr, size, type, ctx)) return false; if (this_cpu_dec_return(kcsan_skip) >= 0) @@ -563,8 +575,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, if (unlikely(watchpoint != NULL)) kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); - else if (unlikely(should_watch(ptr, size, type))) - kcsan_setup_watchpoint(ptr, size, type); + else { + struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ + + if (unlikely(should_watch(ptr, size, type, ctx))) + kcsan_setup_watchpoint(ptr, size, type); + else if (unlikely(ctx->scoped_accesses.prev)) + kcsan_check_scoped_accesses(); + } } /* === Public interface ===================================================== */ @@ -660,6 +678,55 @@ void kcsan_set_access_mask(unsigned long mask) } EXPORT_SYMBOL(kcsan_set_access_mask); +struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + __kcsan_check_access(ptr, size, type); + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + INIT_LIST_HEAD(&sa->list); + sa->ptr = ptr; + sa->size = size; + sa->type = type; + + if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */ + INIT_LIST_HEAD(&ctx->scoped_accesses); + list_add(&sa->list, &ctx->scoped_accesses); + + ctx->disable_count--; + return sa; +} +EXPORT_SYMBOL(kcsan_begin_scoped_access); + +void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__)) + return; + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + list_del(&sa->list); + if (list_empty(&ctx->scoped_accesses)) + /* + * Ensure we do not enter kcsan_check_scoped_accesses() + * slow-path if unnecessary, and avoids requiring list_empty() + * in the fast-path (to avoid a READ_ONCE() and potential + * uaccess warning). 
+ */ + ctx->scoped_accesses.prev = NULL; + + ctx->disable_count--; + + __kcsan_check_access(sa->ptr, sa->size, sa->type); +} +EXPORT_SYMBOL(kcsan_end_scoped_access); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ae0a383238ea..ddc18f1224a4 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -205,6 +205,20 @@ skip_report(enum kcsan_value_change value_change, unsigned long top_frame) static const char *get_access_type(int type) { + if (type & KCSAN_ACCESS_ASSERT) { + if (type & KCSAN_ACCESS_SCOPED) { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses (scoped)"; + else + return "assert no writes (scoped)"; + } else { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses"; + else + return "assert no writes"; + } + } + switch (type) { case 0: return "read"; @@ -214,17 +228,14 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; - - /* - * ASSERT variants: - */ - case KCSAN_ACCESS_ASSERT: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: - return "assert no writes"; - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "assert no accesses"; - + case KCSAN_ACCESS_SCOPED: + return "read (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: + return "read (marked, scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: + return "write (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked, scoped)"; default: BUG(); } -- cgit v1.2.3 From d8949ef1d9f1062848cd068cf369a57ce33dae6f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:58 +0100 Subject: kcsan: Introduce scoped ASSERT_EXCLUSIVE macros Introduce ASSERT_EXCLUSIVE_*_SCOPED(), which provide an intuitive interface to use the scoped-access feature, without having to explicitly mark the start and end of the desired scope. Basing duration of the checks on scope avoids accidental misuse and resulting false positives, which may be hard to debug. See added comments for usage. The macros are implemented using __attribute__((__cleanup__(func))), which is supported by all compilers that currently support KCSAN. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/debugfs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 72ee188ebc54..1a08664a7fab 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -110,6 +110,7 @@ static noinline void microbenchmark(unsigned long iters) */ static long test_dummy; static long test_flags; +static long test_scoped; static noinline void test_thread(unsigned long iters) { const long CHANGE_BITS = 0xff00ff00ff00ff00L; @@ -120,7 +121,8 @@ static noinline void test_thread(unsigned long iters) memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); + pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", + &test_dummy, &test_flags, &test_scoped); cycles = get_cycles(); while (iters--) { @@ -141,6 +143,18 @@ static noinline void test_thread(unsigned long iters) test_flags ^= CHANGE_BITS; /* generate value-change */ __kcsan_check_write(&test_flags, sizeof(test_flags)); + + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); + { + /* Should generate reports anywhere in this block. */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); + BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); + /* Unrelated accesses. */ + __kcsan_check_access(&cycles, sizeof(cycles), 0); + __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); + } + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); } cycles = get_cycles() - cycles; -- cgit v1.2.3 From f770ed10a9ee65529f3ec8d90eb374bbd8b7c238 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Apr 2020 18:44:17 +0200 Subject: kcsan: Fix function matching in report Pass string length as returned by scnprintf() to strnstr(), since strnstr() searches exactly len bytes in haystack, even if it contains a NUL-terminator before haystack+len. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/report.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ddc18f1224a4..cf41d63dd0cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -192,11 +192,11 @@ skip_report(enum kcsan_value_change value_change, unsigned long top_frame) * maintainers. 
*/ char buf[64]; + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame); - snprintf(buf, sizeof(buf), "%ps", (void *)top_frame); - if (!strnstr(buf, "rcu_", sizeof(buf)) && - !strnstr(buf, "_rcu", sizeof(buf)) && - !strnstr(buf, "_srcu", sizeof(buf))) + if (!strnstr(buf, "rcu_", len) && + !strnstr(buf, "_rcu", len) && + !strnstr(buf, "_srcu", len)) return true; } @@ -262,15 +262,15 @@ static const char *get_thread_desc(int task_id) static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; + int len; int skip = 0; for (; skip < num_entries; ++skip) { - snprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); - if (!strnstr(buf, "csan_", sizeof(buf)) && - !strnstr(buf, "tsan_", sizeof(buf)) && - !strnstr(buf, "_once_size", sizeof(buf))) { + len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + if (!strnstr(buf, "csan_", len) && + !strnstr(buf, "tsan_", len) && + !strnstr(buf, "_once_size", len)) break; - } } return skip; } -- cgit v1.2.3 From cdb9b07d8c78be63d72aba9a2686ff161ddd2099 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Apr 2020 18:44:18 +0200 Subject: kcsan: Make reporting aware of KCSAN tests Reporting hides KCSAN runtime functions in the stack trace, with filtering done based on function names. Currently this included all functions (or modules) that would match "kcsan_". Make the filter aware of KCSAN tests, which contain "kcsan_test", and are no longer skipped in the report. This is in preparation for adding a KCSAN test module. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/report.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index cf41d63dd0cd..ac5f8345bae9 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -262,16 +262,32 @@ static const char *get_thread_desc(int task_id) static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; - int len; - int skip = 0; + char *cur; + int len, skip; - for (; skip < num_entries; ++skip) { + for (skip = 0; skip < num_entries; ++skip) { len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); - if (!strnstr(buf, "csan_", len) && - !strnstr(buf, "tsan_", len) && - !strnstr(buf, "_once_size", len)) - break; + + /* Never show tsan_* or {read,write}_once_size. */ + if (strnstr(buf, "tsan_", len) || + strnstr(buf, "_once_size", len)) + continue; + + cur = strnstr(buf, "kcsan_", len); + if (cur) { + cur += sizeof("kcsan_") - 1; + if (strncmp(cur, "test", sizeof("test") - 1)) + continue; /* KCSAN runtime function. */ + /* KCSAN related test. */ + } + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; } + return skip; } -- cgit v1.2.3 From 4c5b566c2193e2af82c891daa5303c8899e61044 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 31 Mar 2020 02:15:44 +0800 Subject: crash_dump: Remove no longer used saved_max_pfn saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH] kdump: Retrieve saved max pfn") It used to make sure that the user does not try to read the physical memory beyond saved_max_pfn. But since commit 921d58c0e699 ("vmcore: remove saved_max_pfn check") it's no longer used for the check. This variable doesn't have any users anymore so just remove it. [ bp: Drop the Calgary IOMMU reference from the commit message. 
] Signed-off-by: Kairui Song Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20200330181544.1595733-1-kasong@redhat.com --- kernel/crash_dump.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -5,12 +5,6 @@ #include #include -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - /* * stores the physical address of elf header of crash image * -- cgit v1.2.3 From 10415533a9062c9e5e27c421d9c232f0cae108fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 22 Jan 2020 19:36:29 +0000 Subject: gcov: Remove old GCC 3.4 support The kernel requires at least GCC 4.8 in order to build, and so there is no need to cater for the pre-4.7 gcov format. Remove the obsolete code. Acked-by: Peter Oberparleiter Reviewed-by: Nick Desaulniers Signed-off-by: Will Deacon --- kernel/gcov/Kconfig | 24 --- kernel/gcov/Makefile | 3 +- kernel/gcov/gcc_3_4.c | 573 -------------------------------------------------- 3 files changed, 1 insertion(+), 599 deletions(-) delete mode 100644 kernel/gcov/gcc_3_4.c (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..feaad597b3f4 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -51,28 +51,4 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. -choice - prompt "Specify GCOV format" - depends on GCOV_KERNEL - depends on CC_IS_GCC - ---help--- - The gcov format is usually determined by the GCC version, and the - default is chosen according to your GCC version. However, there are - exceptions where format changes are integrated in lower-version GCCs. - In such a case, change this option to adjust the format used in the - kernel accordingly. - -config GCOV_FORMAT_3_4 - bool "GCC 3.4 format" - depends on GCC_VERSION < 40700 - ---help--- - Select this option to use the format defined by GCC 3.4. - -config GCOV_FORMAT_4_7 - bool "GCC 4.7 format" - ---help--- - Select this option to use the format defined by GCC 4.7. - -endchoice - endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index d66a74b0f100..16f8ecc7d882 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,6 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c deleted file mode 100644 index acb83558e5df..000000000000 --- a/kernel/gcov/gcc_3_4.c +++ /dev/null @@ -1,573 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code provides functions to handle gcc's profiling data format - * introduced with gcc 3.4. Future versions of gcc may change the gcov - * format (as happened before), so all format-specific information needs - * to be kept modular and easily exchangeable. - * - * This file is based on gcc-internal definitions. Functions and data - * structures are defined to be compatible with gcc counterparts. - * For a better understanding, refer to gcc source: gcc/gcov-io.h. - * - * Copyright IBM Corp. 
2009 - * Author(s): Peter Oberparleiter - * - * Uses gcc-internal data definitions. - */ - -#include -#include -#include -#include -#include -#include "gcov.h" - -#define GCOV_COUNTERS 5 - -static struct gcov_info *gcov_info_head; - -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; - -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - unsigned int n_functions; - const struct gcov_fn_info *functions; - unsigned int ctr_mask; - struct gcov_ctr_info counts[]; -}; - -/** - * gcov_info_filename - return info filename - * @info: profiling data set - */ -const char *gcov_info_filename(struct gcov_info *info) -{ - return info->filename; -} - -/** - * gcov_info_version - return info version - * @info: profiling data set - */ -unsigned int gcov_info_version(struct gcov_info *info) -{ - return info->version; -} - -/** - * gcov_info_next - return next profiling data set - * @info: profiling data set - * - * Returns next gcov_info following @info or first gcov_info in the chain if - * @info is %NULL. - */ -struct gcov_info *gcov_info_next(struct gcov_info *info) -{ - if (!info) - return gcov_info_head; - - return info->next; -} - -/** - * gcov_info_link - link/add profiling data set to the list - * @info: profiling data set - */ -void gcov_info_link(struct gcov_info *info) -{ - info->next = gcov_info_head; - gcov_info_head = info; -} - -/** - * gcov_info_unlink - unlink/remove profiling data set from the list - * @prev: previous profiling data set - * @info: profiling data set - */ -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) -{ - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; -} - -/** - * gcov_info_within_module - check if a profiling data set belongs to a module - * @info: profiling data set - * @mod: module - * - * Returns true if profiling data belongs module, false otherwise. 
- */ -bool gcov_info_within_module(struct gcov_info *info, struct module *mod) -{ - return within_module((unsigned long)info, mod); -} - -/* Symbolic links to be created for each profiling data file. */ -const struct gcov_link gcov_link[] = { - { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ - { 0, NULL}, -}; - -/* - * Determine whether a counter is active. Based on gcc magic. Doesn't change - * at run-time. - */ -static int counter_active(struct gcov_info *info, unsigned int type) -{ - return (1 << type) & info->ctr_mask; -} - -/* Determine number of active counters. Based on gcc magic. */ -static unsigned int num_counter_active(struct gcov_info *info) -{ - unsigned int i; - unsigned int result = 0; - - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(info, i)) - result++; - } - return result; -} - -/** - * gcov_info_reset - reset profiling data to zero - * @info: profiling data set - */ -void gcov_info_reset(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active; i++) { - memset(info->counts[i].values, 0, - info->counts[i].num * sizeof(gcov_type)); - } -} - -/** - * gcov_info_is_compatible - check if profiling data can be added - * @info1: first profiling data set - * @info2: second profiling data set - * - * Returns non-zero if profiling data can be added, zero otherwise. - */ -int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) -{ - return (info1->stamp == info2->stamp); -} - -/** - * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added - * - * Adds profiling counts of @source to @dest. - */ -void gcov_info_add(struct gcov_info *dest, struct gcov_info *source) -{ - unsigned int i; - unsigned int j; - - for (i = 0; i < num_counter_active(dest); i++) { - for (j = 0; j < dest->counts[i].num; j++) { - dest->counts[i].values[j] += - source->counts[i].values[j]; - } - } -} - -/* Get size of function info entry. Based on gcc magic. */ -static size_t get_fn_size(struct gcov_info *info) -{ - size_t size; - - size = sizeof(struct gcov_fn_info) + num_counter_active(info) * - sizeof(unsigned int); - if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int)) - size = ALIGN(size, __alignof__(struct gcov_fn_info)); - return size; -} - -/* Get address of function info entry. Based on gcc magic. */ -static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn) -{ - return (struct gcov_fn_info *) - ((char *) info->functions + fn * get_fn_size(info)); -} - -/** - * gcov_info_dup - duplicate profiling data set - * @info: profiling data set to duplicate - * - * Return newly allocated duplicate on success, %NULL on error. - */ -struct gcov_info *gcov_info_dup(struct gcov_info *info) -{ - struct gcov_info *dup; - unsigned int i; - unsigned int active; - - /* Duplicate gcov_info. */ - active = num_counter_active(info); - dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); - if (!dup) - return NULL; - dup->version = info->version; - dup->stamp = info->stamp; - dup->n_functions = info->n_functions; - dup->ctr_mask = info->ctr_mask; - /* Duplicate filename. */ - dup->filename = kstrdup(info->filename, GFP_KERNEL); - if (!dup->filename) - goto err_free; - /* Duplicate table of functions. */ - dup->functions = kmemdup(info->functions, info->n_functions * - get_fn_size(info), GFP_KERNEL); - if (!dup->functions) - goto err_free; - /* Duplicate counter arrays. 
*/ - for (i = 0; i < active ; i++) { - struct gcov_ctr_info *ctr = &info->counts[i]; - size_t size = ctr->num * sizeof(gcov_type); - - dup->counts[i].num = ctr->num; - dup->counts[i].merge = ctr->merge; - dup->counts[i].values = vmalloc(size); - if (!dup->counts[i].values) - goto err_free; - memcpy(dup->counts[i].values, ctr->values, size); - } - return dup; - -err_free: - gcov_info_free(dup); - return NULL; -} - -/** - * gcov_info_free - release memory for profiling data set duplicate - * @info: profiling data set duplicate to free - */ -void gcov_info_free(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active ; i++) - vfree(info->counts[i].values); - kfree(info->functions); - kfree(info->filename); - kfree(info); -} - -/** - * struct type_info - iterator helper array - * @ctr_type: counter type - * @offset: index of the first value of the current function for this type - * - * This array is needed to convert the in-memory data format into the in-file - * data format: - * - * In-memory: - * for each counter type - * for each function - * values - * - * In-file: - * for each function - * for each counter type - * values - * - * See gcc source gcc/gcov-io.h for more information on data organization. - */ -struct type_info { - int ctr_type; - unsigned int offset; -}; - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @record: record type - * @function: function number - * @type: counter type - * @count: index into values array - * @num_types: number of counter types - * @type_info: helper array to get values-array offset for current function - */ -struct gcov_iterator { - struct gcov_info *info; - - int record; - unsigned int function; - unsigned int type; - unsigned int count; - - int num_types; - struct type_info type_info[]; -}; - -static struct gcov_fn_info *get_func(struct gcov_iterator *iter) -{ - return get_fn_info(iter->info, iter->function); -} - -static struct type_info *get_type(struct gcov_iterator *iter) -{ - return &iter->type_info[iter->type]; -} - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), - GFP_KERNEL); - if (iter) - iter->info = info; - - return iter; -} - -/** - * gcov_iter_free - release memory for iterator - * @iter: file iterator to free - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - int i; - - iter->record = 0; - iter->function = 0; - iter->type = 0; - iter->count = 0; - iter->num_types = 0; - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(iter->info, i)) { - iter->type_info[iter->num_types].ctr_type = i; - iter->type_info[iter->num_types++].offset = 0; - } - } -} - -/* Mapping of logical record number to actual file content. 
*/ -#define RECORD_FILE_MAGIC 0 -#define RECORD_GCOV_VERSION 1 -#define RECORD_TIME_STAMP 2 -#define RECORD_FUNCTION_TAG 3 -#define RECORD_FUNCTON_TAG_LEN 4 -#define RECORD_FUNCTION_IDENT 5 -#define RECORD_FUNCTION_CHECK 6 -#define RECORD_COUNT_TAG 7 -#define RECORD_COUNT_LEN 8 -#define RECORD_COUNT 9 - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - switch (iter->record) { - case RECORD_FILE_MAGIC: - case RECORD_GCOV_VERSION: - case RECORD_FUNCTION_TAG: - case RECORD_FUNCTON_TAG_LEN: - case RECORD_FUNCTION_IDENT: - case RECORD_COUNT_TAG: - /* Advance to next record */ - iter->record++; - break; - case RECORD_COUNT: - /* Advance to next count */ - iter->count++; - /* fall through */ - case RECORD_COUNT_LEN: - if (iter->count < get_func(iter)->n_ctrs[iter->type]) { - iter->record = 9; - break; - } - /* Advance to next counter type */ - get_type(iter)->offset += iter->count; - iter->count = 0; - iter->type++; - /* fall through */ - case RECORD_FUNCTION_CHECK: - if (iter->type < iter->num_types) { - iter->record = 7; - break; - } - /* Advance to next function */ - iter->type = 0; - iter->function++; - /* fall through */ - case RECORD_TIME_STAMP: - if (iter->function < iter->info->n_functions) - iter->record = 3; - else - iter->record = -1; - break; - } - /* Check for EOF. */ - if (iter->record == -1) - return -EINVAL; - else - return 0; -} - -/** - * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. - */ -static int seq_write_gcov_u32(struct seq_file *seq, u32 v) -{ - return seq_write(seq, &v, sizeof(v)); -} - -/** - * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. - */ -static int seq_write_gcov_u64(struct seq_file *seq, u64 v) -{ - u32 data[2]; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - return seq_write(seq, data, sizeof(data)); -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. 
- */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - int rc = -EINVAL; - - switch (iter->record) { - case RECORD_FILE_MAGIC: - rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); - break; - case RECORD_GCOV_VERSION: - rc = seq_write_gcov_u32(seq, iter->info->version); - break; - case RECORD_TIME_STAMP: - rc = seq_write_gcov_u32(seq, iter->info->stamp); - break; - case RECORD_FUNCTION_TAG: - rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); - break; - case RECORD_FUNCTON_TAG_LEN: - rc = seq_write_gcov_u32(seq, 2); - break; - case RECORD_FUNCTION_IDENT: - rc = seq_write_gcov_u32(seq, get_func(iter)->ident); - break; - case RECORD_FUNCTION_CHECK: - rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); - break; - case RECORD_COUNT_TAG: - rc = seq_write_gcov_u32(seq, - GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); - break; - case RECORD_COUNT_LEN: - rc = seq_write_gcov_u32(seq, - get_func(iter)->n_ctrs[iter->type] * 2); - break; - case RECORD_COUNT: - rc = seq_write_gcov_u64(seq, - iter->info->counts[iter->type]. - values[iter->count + get_type(iter)->offset]); - break; - } - return rc; -} -- cgit v1.2.3 From 18aa18566218d4a46d940049b835314d2b071cc2 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:46:24 +0300 Subject: perf/core: Open access to the core for CAP_PERFMON privileged process Open access to monitoring of kernel code, CPUs, tracepoints and namespaces data for a CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons the access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. 
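For reference, the permission checks converted in this and the following patches rely on a helper introduced earlier in the CAP_PERFMON series, outside the kernel/ hunks shown here. A minimal sketch of its behaviour, assuming the capability-header definition, is:

/*
 * Sketch only: the helper the converted checks rely on (capable() and
 * CAP_PERFMON come from <linux/capability.h>). CAP_PERFMON alone is
 * sufficient; CAP_SYS_ADMIN keeps working for backward compatibility.
 */
static inline bool perfmon_capable(void)
{
	return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}

With that helper in place, the namespaces check below only needs to swap capable(CAP_SYS_ADMIN) for perfmon_capable().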
Signed-off-by: Alexey Budankov Reviewed-by: James Morris Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: linux-man@vger.kernel.org Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/471acaef-bb8a-5ce2-923f-90606b78eef9@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..74025b7b83a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11504,7 +11504,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } -- cgit v1.2.3 From c9e0924e5c2b59365f9c0d43ff8722e79ecf4088 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:47:01 +0300 Subject: perf/core: open access to probes for CAP_PERFMON privileged process Open access to monitoring via kprobes and uprobes and eBPF tracing for CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. perf kprobes and uprobes are used by ftrace and eBPF. perf probe uses ftrace to define new kprobe events, and those events are treated as tracepoint events. eBPF defines new probes via perf_event_open interface and then the probes are used in eBPF tracing. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. 
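Because the kprobe and uprobe PMUs are only reachable through perf_event_open(), the practical effect is easiest to see from user space. The sketch below is hypothetical: the probed symbol, the CPU choice and the error handling are illustrative assumptions, not part of this patch.

/*
 * Hypothetical user-space sketch: with CAP_PERFMON (or CAP_SYS_ADMIN) a
 * process can create a kprobe perf event through the dynamic "kprobe" PMU;
 * without either capability perf_event_open() fails with EACCES.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	FILE *f;
	int type, fd;

	/* The kprobe PMU has a dynamically assigned type; read it from sysfs. */
	f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%d", &type) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.kprobe_func = (__u64)(unsigned long)"do_sys_openat2"; /* assumed symbol */
	attr.sample_period = 1;

	fd = sys_perf_event_open(&attr, -1 /* all tasks */, 0 /* CPU 0 */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open(kprobe)"); /* EACCES without CAP_PERFMON */
		return 1;
	}
	printf("kprobe perf event created, fd=%d\n", fd);
	close(fd);
	return 0;
}

The same pattern applies to the uprobe PMU via /sys/bus/event_source/devices/uprobe/type and attr.uprobe_path.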
Signed-off-by: Alexey Budankov Reviewed-by: James Morris Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Cc: linux-man@vger.kernel.org Link: http://lore.kernel.org/lkml/3c129d9a-ba8a-3483-ecc5-ad6c8e7c203f@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 74025b7b83a0..52951e9e8e1b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9397,7 +9397,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9457,7 +9457,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* -- cgit v1.2.3 From 031258da05956646c5606023ab0abe10a7e68ea1 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:48:54 +0300 Subject: trace/bpf_trace: Open access for CAP_PERFMON privileged process Open access to bpf_trace monitoring for CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons access to bpf_trace monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged with respect to CAP_PERFMON capability. 
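The check changed here guards, among other things, the PERF_EVENT_IOC_QUERY_BPF ioctl that reports which BPF programs are attached to a tracepoint event. A hypothetical user-space sketch follows; perf_fd is assumed to be a tracepoint event fd obtained from perf_event_open(), and the buffer size is illustrative.

/*
 * Hypothetical sketch: query the BPF programs attached to a tracepoint perf
 * event. After this patch the ioctl works for a CAP_PERFMON-only process;
 * before it, the query path returned EPERM without CAP_SYS_ADMIN.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>

static void query_attached_progs(int perf_fd)
{
	const __u32 max_ids = 16;	/* illustrative buffer size */
	struct perf_event_query_bpf *query;

	query = calloc(1, sizeof(*query) + max_ids * sizeof(__u32));
	if (!query)
		return;
	query->ids_len = max_ids;

	if (ioctl(perf_fd, PERF_EVENT_IOC_QUERY_BPF, query) == 0)
		printf("%u BPF program(s) attached\n", query->prog_cnt);
	else
		perror("PERF_EVENT_IOC_QUERY_BPF");

	free(query);
}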
Signed-off-by: Alexey Budankov Reviewed-by: James Morris Acked-by: Song Liu Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/c0a0ae47-8b6e-ff3e-416b-3cd1faaf71c0@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..d7d88007dc6d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1468,7 +1468,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; -- cgit v1.2.3 From db991af02f11053558431467102ee5832894d7a4 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 8 Apr 2020 16:31:06 +0200 Subject: module: break nested ARCH_HAS_STRICT_MODULE_RWX and STRICT_MODULE_RWX #ifdefs Various frob_* and module_{enable,disable}_* functions are defined in a CONFIG_ARCH_HAS_STRICT_MODULE_RWX ifdef block which also has a nested CONFIG_STRICT_MODULE_RWX ifdef block within it. This is unecessary and makes things hard to read. Not only that, this construction requires redundant empty stubs for module_enable_nx(). I suspect this was originally done for cosmetic reasons - to keep all the frob_* functions in the same place, and all the module_{enable,disable}_* functions right after, but as a result it made the code harder to read. Make this more readable by unnesting the ifdef blocks and getting rid of the redundant empty stubs. Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..01d01a489778 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1943,7 +1943,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -1957,6 +1956,14 @@ static void mod_sysfs_teardown(struct module *mod) * * These values are always page-aligned (as is base) */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the + * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of + * whether we are strict. 
+ */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX static void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -1966,6 +1973,15 @@ static void frob_text(const struct module_layout *layout, layout->text_size >> PAGE_SHIFT); } +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + #ifdef CONFIG_STRICT_MODULE_RWX static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) @@ -2037,18 +2053,9 @@ static void module_enable_nx(const struct module *mod) } #else /* !CONFIG_STRICT_MODULE_RWX */ +/* module_{enable,disable}_ro() stubs are in module.h */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) -{ - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - #ifdef CONFIG_LIVEPATCH /* -- cgit v1.2.3 From 58eb7b77ad01f058e523554cb7bae7272a7d2579 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Sat, 18 Apr 2020 00:24:51 +0800 Subject: smp: Use smp_call_func_t in on_each_cpu() Use smp_call_func_t instead of the open coded function pointer argument. Signed-off-by: Kaitao Cheng Signed-off-by: Thomas Gleixner Acked-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20200417162451.91969-1-pilgrimtao@gmail.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..84303197caf9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -620,7 +620,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; -- cgit v1.2.3 From 05f099a7d0a73114c6eb3e6a359ea97563b47031 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 17 Apr 2020 15:36:05 +0800 Subject: dma-debug: make __dma_entry_alloc_check_leak() static Fix the following sparse warning: kernel/dma/debug.c:659:6: warning: symbol '__dma_entry_alloc_check_leak' was not declared. Should it be static? 
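The quoted warning is sparse's standard diagnostic for a symbol with external linkage that no header declares; the kernel build runs sparse via make C=1 (recompiled files) or C=2 (all files). A made-up illustration of the warning class, not taken from the kernel:

/*
 * foo.c: helper() is only called from this file and is not declared in any
 * header, so sparse reports
 *
 *     warning: symbol 'helper' was not declared. Should it be static?
 *
 * Internal linkage documents the intent and keeps the symbol out of the
 * global namespace.
 */
static void helper(void)
{
	/* ... file-local implementation ... */
}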
Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 9e1777c81f55..36c962a86bf2 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -656,7 +656,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -void __dma_entry_alloc_check_leak(void) +static void __dma_entry_alloc_check_leak(void) { u32 tmp = nr_total_entries % nr_prealloc_entries; -- cgit v1.2.3 From e860c299ac0d738b44ff91693f11e63080a29698 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:52 -0700 Subject: dma-remap: separate DMA atomic pools from direct remap code DMA atomic pools will be needed beyond only CONFIG_DMA_DIRECT_REMAP so separate them out into their own file. This also adds a new Kconfig option that can be subsequently used for options, such as CONFIG_AMD_MEM_ENCRYPT, that will utilize the coherent pools but do not have a dependency on direct remapping. For this patch alone, there is no functional change introduced. Reviewed-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: fixup copyrights and remove unused includes] Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 6 ++- kernel/dma/Makefile | 1 + kernel/dma/pool.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/remap.c | 121 +-------------------------------------------------- 4 files changed, 130 insertions(+), 121 deletions(-) create mode 100644 kernel/dma/pool.c (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..d006668c0027 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -79,10 +79,14 @@ config DMA_REMAP select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..370f63344e9c 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..3df5d9d39922 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Copyright (C) 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include + +static struct gen_pool *atomic_pool __ro_after_init; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) +{ + unsigned int pool_size_order = get_order(atomic_pool_size); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + int ret; + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + pool_size_order, false); + else + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + if (!page) + goto out; + + arch_dma_prep_coherent(page, atomic_pool_size); + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto destroy_genpool; + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), atomic_pool_size, -1); + if (ret) + goto remove_mapping; + gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool = NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, pool_size_order); +out: + pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} +postcore_initcall(dma_atomic_pool_init); + +bool dma_in_atomic_pool(void *start, size_t size) +{ + if (unlikely(!atomic_pool)) + return false; + + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + + return ptr; +} + +bool dma_free_from_pool(void *start, size_t size) +{ + if (!dma_in_atomic_pool(start, size)) + return false; + gen_pool_free(atomic_pool, (unsigned long)start, size); + return true; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..f7b402849891 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. 
* Copyright (c) 2014 The Linux Foundation */ -#include -#include -#include -#include -#include +#include #include #include @@ -97,117 +92,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From c84dc6e68a1d2464e050d9694be4e4ff49e32bfd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:55 -0700 Subject: dma-pool: add additional coherent pools to map to gfp mask The single atomic pool is allocated from the lowest zone possible since it is guaranteed to be applicable for any DMA allocation. 
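For context, the per-device pool selection added below derives a gfp mask from the device's coherent DMA mask. A rough sketch of that selection follows; the real helper, dma_direct_optimal_gfp_mask() in kernel/dma/direct.c, also computes a physical address limit for the caller and honours dev->bus_dma_limit.

#include <linux/dma-direct.h>	/* zone_dma_bits */
#include <linux/dma-mapping.h>	/* DMA_BIT_MASK() */
#include <linux/gfp.h>

/*
 * Sketch only: pick the lowest zone that still satisfies the coherent mask;
 * that zone flag in turn selects the matching atomic pool in dev_to_pool()
 * below.
 */
static gfp_t sketch_optimal_gfp_mask(u64 coherent_dma_mask)
{
	if (coherent_dma_mask <= DMA_BIT_MASK(zone_dma_bits))
		return GFP_DMA;		/* arch-defined narrow DMA zone */
	if (coherent_dma_mask <= DMA_BIT_MASK(32))
		return GFP_DMA32;
	return 0;			/* no zone restriction needed */
}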
Devices may allocate through the DMA API but not have a strict reliance on GFP_DMA memory. Since the atomic pool will be used for all non-blockable allocations, returning all memory from ZONE_DMA may unnecessarily deplete the zone. Provision for multiple atomic pools that will map to the optimal gfp mask of the device. When allocating non-blockable memory, determine the optimal gfp mask of the device and use the appropriate atomic pool. The coherent DMA mask will remain the same between allocation and free and, thus, memory will be freed to the same atomic pool it was allocated from. __dma_atomic_pool_init() will be changed to return struct gen_pool * later once dynamic expansion is added. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 12 +++--- kernel/dma/pool.c | 120 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..a834ee22f8ff 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -89,8 +89,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -128,7 +128,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs) && !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -212,7 +212,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (force_dma_unencrypted(dev)) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 3df5d9d39922..db4f89ac5f5f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -10,7 +10,9 @@ #include #include -static struct gen_pool *atomic_pool __ro_after_init; +static struct gen_pool *atomic_pool_dma __ro_after_init; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -22,89 +24,119 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static gfp_t dma_atomic_pool_gfp(void) +static int __init __dma_atomic_pool_init(struct gen_pool **pool, + size_t pool_size, gfp_t gfp) { - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init 
dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + const unsigned int order = get_order(pool_size); + const unsigned long nr_pages = pool_size >> PAGE_SHIFT; struct page *page; void *addr; int ret; if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); + page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + page = alloc_pages(gfp, order); if (!page) goto out; - arch_dma_prep_coherent(page, atomic_pool_size); + arch_dma_prep_coherent(page, pool_size); - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) + *pool = gen_pool_create(PAGE_SHIFT, -1); + if (!*pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, + addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto destroy_genpool; - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); + ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), + pool_size, -1); if (ret) goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + pool_size >> 10, &gfp); return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); + dma_common_free_remap(addr, pool_size); destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; + gen_pool_destroy(*pool); + *pool = NULL; free_page: if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); + __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); return -ENOMEM; } + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + int err; + + ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, + GFP_KERNEL); + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + err = __dma_atomic_pool_init(&atomic_pool_dma, + atomic_pool_size, GFP_DMA); + if (!ret && err) + ret = err; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + err = __dma_atomic_pool_init(&atomic_pool_dma32, + atomic_pool_size, GFP_DMA32); + if (!ret && err) + ret = err; + } + return ret; +} postcore_initcall(dma_atomic_pool_init); -bool dma_in_atomic_pool(void *start, size_t size) +static inline struct gen_pool *dev_to_pool(struct device *dev) { - if (unlikely(!atomic_pool)) - return false; + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); } -void *dma_alloc_from_pool(size_t size, struct 
page **ret_page, gfp_t flags) +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) { + struct gen_pool *pool = dev_to_pool(dev); unsigned long val; void *ptr = NULL; - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); return NULL; } - val = gen_pool_alloc(atomic_pool, size); + val = gen_pool_alloc(pool, size); if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; @@ -114,10 +146,12 @@ void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) return ptr; } -bool dma_free_from_pool(void *start, size_t size) +bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - if (!dma_in_atomic_pool(start, size)) + struct gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); + gen_pool_free(pool, (unsigned long)start, size); return true; } -- cgit v1.2.3 From 0c1bc6b84525b96aa9fb8f6fbe8c5cb26a5c0ead Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:37 +0200 Subject: docs: filesystems: fix renamed references Some filesystem references got broken by a previous patch series I submitted. Address those. Signed-off-by: Mauro Carvalho Chehab Acked-by: David Sterba # fs/affs/Kconfig Link: https://lore.kernel.org/r/57318c53008dbda7f6f4a5a9e5787f4d37e8565a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) -- cgit v1.2.3 From 03c109d66867767ddbb0fe09223410a79193f5ea Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:58 +0200 Subject: futex: get rid of a kernel-docs build warning Adjust whitespaces and blank lines in order to get rid of this: ./kernel/futex.c:491: WARNING: Definition list ends without a blank line; unexpected unindent. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/57788af7889161483e0c97f91c079cfb3986c4b3.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. 
* * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex -- cgit v1.2.3 From a48b284b403a4a073d8beb72d2bb33e54df67fb6 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 20 Apr 2020 10:09:29 -0400 Subject: audit: fix a net reference leak in audit_send_reply() If audit_send_reply() fails when trying to create a new thread to send the reply it also fails to cleanup properly, leaking a reference to a net structure. This patch fixes the error path and makes a handful of other cleanups that came up while fixing the code. Reported-by: teroincn@gmail.com Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b69c8b460341..66b81358b64f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -924,19 +924,30 @@ out_kfree_skb: return NULL; } +static void audit_free_reply(struct audit_reply *reply) +{ + if (!reply) + return; + + if (reply->skb) + kfree_skb(reply->skb); + if (reply->net) + put_net(reply->net); + kfree(reply); +} + static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct sock *sk = audit_get_sk(reply->net); audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(sk, reply->skb, reply->portid, 0); - put_net(reply->net); - kfree(reply); + netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); + reply->skb = NULL; + audit_free_reply(reply); return 0; } @@ -950,35 +961,32 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the port id. - * No failure notifications. + * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - struct net *net = sock_net(NETLINK_CB(request_skb).sk); - struct sk_buff *skb; struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); + struct audit_reply *reply; + reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; - skb = audit_make_reply(seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->net = get_net(net); + reply->skb = audit_make_reply(seq, type, done, multi, payload, size); + if (!reply->skb) + goto err; + reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; - reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); + if (IS_ERR(tsk)) + goto err; + + return; + +err: + audit_free_reply(reply); } /* -- cgit v1.2.3 From 5c3a7db0c7ec4bbd5bd3f48af9be859a8fa3e532 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Apr 2020 19:13:03 +0200 Subject: module: Harden STRICT_MODULE_RWX We're very close to enforcing W^X memory, refuse to load modules that violate this principle per construction. 
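At the ELF level, rejecting such modules "per construction" means the decision can be made from the section headers alone, before anything is mapped or relocated. Illustration only, using the user-space types from <elf.h> rather than the kernel's Elf_Shdr typedefs:

#include <elf.h>
#include <stdbool.h>

/*
 * A section that asks to be both writable and executable can never satisfy
 * W^X, so a loader may refuse it before allocating or relocating anything.
 */
static bool section_violates_wx(const Elf64_Shdr *shdr)
{
	const unsigned long wx = SHF_WRITE | SHF_EXECINSTR;

	return (shdr->sh_flags & wx) == wx;
}

The kernel-side check added below applies this test to every section header and fails the load with -ENOEXEC if any section trips it.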
[jeyu: move module_enforce_rwx_sections under STRICT_MODULE_RWX as per discussion] Link: http://lore.kernel.org/r/20200403171303.GK20760@hirez.programming.kicks-ass.net Acked-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jessica Yu --- kernel/module.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 01d01a489778..70fc20583e66 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2052,9 +2052,28 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + return -ENOEXEC; + } + + return 0; +} + #else /* !CONFIG_STRICT_MODULE_RWX */ /* module_{enable,disable}_ro() stubs are in module.h */ static void module_enable_nx(const struct module *mod) { } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + return 0; +} #endif /* CONFIG_STRICT_MODULE_RWX */ #ifdef CONFIG_LIVEPATCH @@ -3385,6 +3404,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (err < 0) return ERR_PTR(err); + err = module_enforce_rwx_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err < 0) + return ERR_PTR(err); + /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; -- cgit v1.2.3 From 51161bfc66a68d21f13d15a689b3ea7980457790 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 18:55:06 +0300 Subject: kernel/module: Hide vermagic header file from general use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VERMAGIC* definitions are not supposed to be used by the drivers, see this [1] bug report, so introduce special define to guard inclusion of this header file and define it in kernel/modules.h and in internal script that generates *.mod.c files. In-tree module build: ➜ kernel git:(vermagic) ✗ make clean ➜ kernel git:(vermagic) ✗ make M=drivers/infiniband/hw/mlx5 ➜ kernel git:(vermagic) ✗ modinfo drivers/infiniband/hw/mlx5/mlx5_ib.ko filename: /images/leonro/src/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions Out-of-tree module build: ➜ mlx5 make -C /images/leonro/src/kernel clean M=/tmp/mlx5 ➜ mlx5 make -C /images/leonro/src/kernel M=/tmp/mlx5 ➜ mlx5 modinfo /tmp/mlx5/mlx5_ib.ko filename: /tmp/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions [1] https://lore.kernel.org/lkml/20200411155623.GA22175@zn.tnic Reported-by: Borislav Petkov Acked-by: Borislav Petkov Acked-by: Jessica Yu Co-developed-by: Masahiro Yamada Signed-off-by: Masahiro Yamada Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..8833e848b73c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,6 +4,9 @@ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
*/ + +#define INCLUDE_VERMAGIC + #include #include #include -- cgit v1.2.3 From 3054d06719079388a543de6adb812638675ad8f5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 21 Apr 2020 09:10:56 -0400 Subject: audit: fix a net reference leak in audit_list_rules_send() If audit_list_rules_send() fails when trying to create a new thread to send the rules it also fails to cleanup properly, leaking a reference to a net structure. This patch fixes the error patch and renames audit_send_list() to audit_send_list_thread() to better match its cousin, audit_send_reply_thread(). Reported-by: teroincn@gmail.com Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/audit.h | 2 +- kernel/auditfilter.c | 16 +++++++--------- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 66b81358b64f..622c30246d19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -880,7 +880,7 @@ main_queue: return 0; } -int audit_send_list(void *_dest) +int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; diff --git a/kernel/audit.h b/kernel/audit.h index 2eed4d231624..f0233dc40b17 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -229,7 +229,7 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *_dest); +int audit_send_list_thread(void *_dest); extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 026e34da4ace..a10e2997aa6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { - u32 portid = NETLINK_CB(request_skb).portid; - struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; - int err = 0; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(net); - dest->portid = portid; + dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); + dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); + put_net(dest->net); kfree(dest); - err = PTR_ERR(tsk); + return PTR_ERR(tsk); } - return err; + return 0; } int audit_comparator(u32 left, u32 op, u32 right) -- cgit v1.2.3 From 788f87ac608c518b74f338acb95f197cf6e3d0c4 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:09 +0300 Subject: xdp: export the DEV_MAP_BULK_SIZE macro Export the DEV_MAP_BULK_SIZE macro to the header file so that drivers can directly use it as the maximum number of xdp_frames received in the .ndo_xdp_xmit() callback. Signed-off-by: Ioana Ciornei Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- kernel/bpf/devmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..a51d9fb7a359 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -52,7 +52,6 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; -- cgit v1.2.3 From 54adadf9b08571fb8b11dc9d0d3a2ddd39825efd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 20 Apr 2020 12:09:58 +0200 Subject: dma-pool: dynamically expanding atomic pools When an atomic pool becomes fully depleted because it is now relied upon for all non-blocking allocations through the DMA API, allow background expansion of each pool by a kworker. When an atomic pool has less than the default size of memory left, kick off a kworker to dynamically expand the pool in the background. The pool is doubled in size, up to MAX_ORDER-1. If memory cannot be allocated at the requested order, smaller allocation(s) are attempted. This allows the default size to be kept quite low when one or more of the atomic pools is not used. Allocations for lowmem should also use GFP_KERNEL for the benefits of reclaim, so use GFP_KERNEL | GFP_DMA and GFP_KERNEL | GFP_DMA32 for lowmem allocations. This also allows __dma_atomic_pool_init() to return a pointer to the pool to make initialization cleaner. Also switch over some node ids to the more appropriate NUMA_NO_NODE. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 122 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index db4f89ac5f5f..ffe866c2c034 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -9,13 +9,17 @@ #include #include #include +#include static struct gen_pool *atomic_pool_dma __ro_after_init; static struct gen_pool *atomic_pool_dma32 __ro_after_init; static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; static int __init early_coherent_pool(char *p) { @@ -24,76 +28,116 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static int __init __dma_atomic_pool_init(struct gen_pool **pool, - size_t pool_size, gfp_t gfp) +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) { - const unsigned int order = get_order(pool_size); - const unsigned long nr_pages = pool_size >> PAGE_SHIFT; + unsigned int order; struct page *page; void *addr; - int ret; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); - else - page = alloc_pages(gfp, order); + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + else + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); if (!page) goto out; arch_dma_prep_coherent(page, pool_size); - *pool = 
gen_pool_create(PAGE_SHIFT, -1); - if (!*pool) - goto free_page; - addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) - goto destroy_genpool; + goto free_page; - ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), - pool_size, -1); + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); if (ret) goto remove_mapping; - gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", - pool_size >> 10, &gfp); return 0; remove_mapping: dma_common_free_remap(addr, pool_size); -destroy_genpool: - gen_pool_destroy(*pool); - *pool = NULL; free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) + if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", - pool_size >> 10, &gfp); - return -ENOMEM; + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; } static int __init dma_atomic_pool_init(void) { int ret = 0; - int err; - ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, - GFP_KERNEL); + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; if (IS_ENABLED(CONFIG_ZONE_DMA)) { - err = __dma_atomic_pool_init(&atomic_pool_dma, - atomic_pool_size, GFP_DMA); - if (!ret && err) - ret = err; + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; } if (IS_ENABLED(CONFIG_ZONE_DMA32)) { - err = __dma_atomic_pool_init(&atomic_pool_dma32, - atomic_pool_size, GFP_DMA32); - if (!ret && err) - ret = err; + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; } return ret; } @@ -142,6 +186,8 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, ptr = (void *)val; memset(ptr, 0, size); } + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); return ptr; } -- cgit v1.2.3 From 76a19940bd62a81148c303f3df6d0cee9ae4b509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:58 -0700 Subject: dma-direct: atomic allocations must come from atomic coherent pools When a device requires 
unencrypted memory and the context does not allow blocking, memory must be returned from the atomic coherent pools. This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the config only requires CONFIG_DMA_COHERENT_POOL. This will be used for CONFIG_AMD_MEM_ENCRYPT in a subsequent patch. Keep all memory in these pools unencrypted. When set_memory_decrypted() fails, this prohibits the memory from being added. If adding memory to the genpool fails, and set_memory_encrypted() subsequently fails, there is no alternative other than leaking the memory. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 46 +++++++++++++++++++++++++++++++++++++++------- kernel/dma/pool.c | 27 ++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a834ee22f8ff..0a4881e59aa7 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -76,6 +76,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { @@ -125,9 +158,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; @@ -204,6 +235,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -211,10 +247,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ffe866c2c034..c8d61b3a7bd6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t 
pool_size, arch_dma_prep_coherent(page, pool_size); +#ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto free_page; - +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). + */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), pool_size, NUMA_NO_NODE); if (ret) - goto remove_mapping; + goto encrypt_mapping; return 0; +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -free_page: +#endif +free_page: __maybe_unused if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: -- cgit v1.2.3 From 2edc5bb3c5cc42131438460a50b7b16905c81c2a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:59 -0700 Subject: dma-pool: add pool sizes to debugfs The atomic DMA pools can dynamically expand based on non-blocking allocations that need to use it. Export the sizes of each of these pools, in bytes, through debugfs for measurement. Suggested-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: remove the !CONFIG_DEBUG_FS stubs] Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index c8d61b3a7bd6..dde6de7f8e83 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. 
* Copyright (C) 2020 Google LLC */ +#include #include #include #include @@ -13,8 +14,11 @@ #include static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -29,6 +33,29 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -76,6 +103,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, if (ret) goto encrypt_mapping; + dma_atomic_pool_size_add(gfp, pool_size); return 0; encrypt_mapping: @@ -160,6 +188,8 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool_dma32) ret = -ENOMEM; } + + dma_atomic_pool_debugfs_init(); return ret; } postcore_initcall(dma_atomic_pool_init); -- cgit v1.2.3 From 1d659236fb43c4d2b37af7a4309681e834e9ec9a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:05:02 -0700 Subject: dma-pool: scale the default DMA coherent pool size with memory capacity When AMD memory encryption is enabled, some devices may use more than 256KB/sec from the atomic pools. It would be more appropriate to scale the default size based on memory capacity unless the coherent_pool option is used on the kernel command line. This provides a slight optimization on initial expansion and is deemed appropriate due to the increased reliance on the atomic pools. Note that the default size of 128KB per pool will normally be larger than the single coherent pool implementation since there are now up to three coherent pools (DMA, DMA32, and kernel). Note that even prior to this patch, coherent_pool= for sizes larger than 1 << (PAGE_SHIFT + MAX_ORDER-1) can fail. With new dynamic expansion support, this would be trivially extensible to allow even larger initial sizes. 
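As a rough illustration of the sizing rule described here (a sketch only, not the patch code; the helper name is hypothetical and the 4KB page size / MAX_ORDER == 11 figures are assumptions used for the worked numbers):

/*
 * Illustrative sketch: 128KB per 1GB of RAM, at least 128KB, capped at one
 * MAX_ORDER-1 sized block. With 4KB pages and MAX_ORDER == 11, a 4GB
 * machine gets min(4 * 128KB, 4MB) = 512KB per pool.
 */
static size_t example_default_pool_size(unsigned long ram_bytes)
{
	size_t size = max_t(size_t, ram_bytes / SZ_1G, 1) * SZ_128K;

	return min_t(size_t, size, 1UL << (PAGE_SHIFT + MAX_ORDER - 1));
}
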
Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index dde6de7f8e83..35bb51c31fff 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -20,8 +20,8 @@ static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; static unsigned long pool_size_kernel; -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; /* Dynamic background expansion when the atomic pool is near capacity */ static struct work_struct atomic_pool_work; @@ -170,6 +170,16 @@ static int __init dma_atomic_pool_init(void) { int ret = 0; + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * + SZ_128K; + atomic_pool_size = min_t(size_t, atomic_pool_size, + 1 << (PAGE_SHIFT + MAX_ORDER-1)); + } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, -- cgit v1.2.3 From 298f3db6ee690259927b105d5ad1079563361323 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 23 Apr 2020 09:31:31 -0700 Subject: dma-contiguous: fix comment for dma_release_from_contiguous Commit 90ae409f9eb3 ("dma-direct: fix zone selection after an unaddressable CMA allocation") changed the logic in dma_release_from_contiguous to remove the normal pages fallback path, but did not update the comment. Fix that. Signed-off-by: Peter Collingbourne Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8bc6f2d670f9..15bc5026c485 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,8 +222,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. 
* * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need -- cgit v1.2.3 From ce5155c4f8226f4816f9f7cb88149ec275c7b0a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 18 Feb 2020 22:43:15 -0500 Subject: compat sysinfo(2): don't bother with field-by-field copyout Signed-off-by: Al Viro --- kernel/sys.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..b4a0324a0699 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2634,6 +2634,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2659,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From b0b3fb6759220d4fa359e9ac486859c9d422c204 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sat, 18 Apr 2020 09:37:35 +0800 Subject: bpf: Remove set but not used variable 'dst_known' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes gcc '-Wunused-but-set-variable' warning: kernel/bpf/verifier.c:5603:18: warning: variable ‘dst_known’ set but not used [-Wunused-but-set-variable], delete this variable. 
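For reference, a minimal hypothetical reproducer of the warning class being silenced; the names are borrowed from the patch context but the function itself is made up:

/* The variable is assigned but its value never feeds into anything, which
 * is the situation with dst_known after the surrounding refactoring. */
static int example(int alu32)
{
	int dst_known;			/* set below, never read */

	dst_known = (alu32 != 0);	/* gcc warns: set but not used */
	return alu32 + 1;
}
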
Signed-off-by: Mao Wenan Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200418013735.67882-1-maowenan@huawei.com --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa1d8245b925..15ba8bf92ca9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5609,7 +5609,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known, dst_known; + bool src_known; s64 smin_val, smax_val; u64 umin_val, umax_val; s32 s32_min_val, s32_max_val; @@ -5631,7 +5631,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (alu32) { src_known = tnum_subreg_is_const(src_reg.var_off); - dst_known = tnum_subreg_is_const(dst_reg->var_off); if ((src_known && (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || s32_min_val > s32_max_val || u32_min_val > u32_max_val) { @@ -5643,7 +5642,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, } } else { src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- kernel/bpf/cgroup.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n linux-next build bot reported compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. 
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} -- cgit v1.2.3 From 082b57e3eb09810d357083cca5ee2df02c16aec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 20 Apr 2020 11:47:50 -0700 Subject: net: bpf: Make bpf_ktime_get_ns() available to non GPL programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The entire implementation is in kernel/bpf/helpers.c: BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; and this was presumably marked GPL due to kernel/time/timekeeping.c: EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); and while that may make sense for kernel modules (although even that is doubtful), there is currently AFAICT no other source of time available to ebpf. Furthermore this is really just equivalent to clock_gettime(CLOCK_MONOTONIC) which is exposed to userspace (via vdso even to make it performant)... As such, I see no reason to keep the GPL restriction. 
(In the future I'd like to have access to time from Apache licensed ebpf code) Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dbba4f41d508..9a6b23387d02 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -151,7 +151,7 @@ BPF_CALL_0(bpf_ktime_get_ns) const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, }; -- cgit v1.2.3 From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: -- cgit v1.2.3 From 6f8a57ccf8511724e6f48d732cb2940889789ab2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko 
Date: Thu, 23 Apr 2020 12:58:50 -0700 Subject: bpf: Make verifier log more relevant by default To make BPF verifier verbose log more releavant and easier to use to debug verification failures, "pop" parts of log that were successfully verified. This has effect of leaving only verifier logs that correspond to code branches that lead to verification failure, which in practice should result in much shorter and more relevant verifier log dumps. This behavior is made the default behavior and can be overriden to do exhaustive logging by specifying BPF_LOG_LEVEL2 log level. Using BPF_LOG_LEVEL2 to disable this behavior is not ideal, because in some cases it's good to have BPF_LOG_LEVEL2 per-instruction register dump verbosity, but still have only relevant verifier branches logged. But for this patch, I didn't want to add any new flags. It might be worth-while to just rethink how BPF verifier logging is performed and requested and streamline it a bit. But this trimming of successfully verified branches seems to be useful and a good default behavior. To test this, I modified runqslower slightly to introduce read of uninitialized stack variable. Log (**truncated in the middle** to save many lines out of this commit message) BEFORE this change: ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) [ ... instruction dump from insn #7 through #50 are cut out ... ] 51: (b7) r2 = 16 52: (85) call bpf_get_current_comm#16 last_idx 52 first_idx 42 regs=4 stack=0 before 51: (b7) r2 = 16 ; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, 53: (bf) r1 = r6 54: (18) r2 = 0xffff8881f3868800 56: (18) r3 = 0xffffffff 58: (bf) r4 = r7 59: (b7) r5 = 32 60: (85) call bpf_perf_event_output#25 last_idx 60 first_idx 53 regs=20 stack=0 before 59: (b7) r5 = 32 61: (bf) r2 = r10 ; event.pid = pid; 62: (07) r2 += -16 ; bpf_map_delete_elem(&start, &pid); 63: (18) r1 = 0xffff8881f3868000 65: (85) call bpf_map_delete_elem#3 ; } 66: (b7) r0 = 0 67: (95) exit from 44 to 66: safe from 34 to 66: safe from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881f3868000 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how there is a successful code path from instruction 0 through 67, few successfully verified jumps (44->66, 34->66), and only after that 11->28 jump plus error on instruction #32. 
AFTER this change (full verifier log, **no truncation**): ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) 7: (79) r2 = *(u64 *)(r1 +16) ; if (prev->state == TASK_RUNNING) 8: (55) if r2 != 0x0 goto pc+19 R1_w=ptr_task_struct(id=0,off=0,imm=0) R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; trace_enqueue(prev->tgid, prev->pid); 9: (61) r1 = *(u32 *)(r1 +1184) 10: (63) *(u32 *)(r10 -4) = r1 ; if (!pid || (targ_pid && targ_pid != pid)) 11: (15) if r1 == 0x0 goto pc+16 from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881db3ce800 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how in this case, there are 0-11 instructions + jump from 11 to 28 is recorded + 28-32 instructions with error on insn #32. test_verifier test runner was updated to specify BPF_LOG_LEVEL2 for VERBOSE_ACCEPT expected result due to potentially "incomplete" success verbose log at BPF_LOG_LEVEL1. On success, verbose log will only have a summary of number of processed instructions, etc, but no branch tracing log. Having just a last succesful branch tracing seemed weird and confusing. Having small and clean summary log in success case seems quite logical and nice, though. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200423195850.1259827-1-andriin@fb.com --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ba8bf92ca9..91728e0f27eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,6 +168,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -283,6 +285,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. 
* bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -846,7 +860,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +874,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +903,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +932,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -8407,6 +8424,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8683,7 +8701,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -10206,6 +10224,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10268,7 +10287,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ -- cgit v1.2.3 From 26363af5643490a817272e1cc6f1d3f1d550a699 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:35 +0200 Subject: mm: remove watermark_boost_factor_sysctl_handler watermark_boost_factor_sysctl_handler is just a pointless wrapper for proc_dointvec_minmax, so remove it and use proc_dointvec_minmax directly. 
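For context, the removed handler was essentially a one-line pass-through; the following is a rough reconstruction from the commit description, not code quoted from the patch:

int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *length, loff_t *ppos)
{
	return proc_dointvec_minmax(table, write, buffer, length, ppos);
}
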
Signed-off-by: Christoph Hellwig Acked-by: David Rientjes Signed-off-by: Al Viro --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..99d27acf4646 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1491,7 +1491,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { -- cgit v1.2.3 From 2374c09b1c8a883bb9b4b2fc3756703eeb618f4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:36 +0200 Subject: sysctl: remove all extern declaration from sysctl.c Extern declarations in .c files are a bad style and can lead to mismatches. Use existing definitions in headers where they exist, and otherwise move the external declarations to suitable header files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 45 +++------------------------------------------ 1 file changed, 3 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99d27acf4646..31b934865ebc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #include +#include +#include +#include #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -243,14 +212,6 @@ static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -- cgit v1.2.3 From f461d2dcd511c020a26d4d791fae595c65ed09b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:37 +0200 Subject: sysctl: avoid forward declarations Move the sysctl tables to the end of the file to avoid lots of pointless forward declarations. 
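The resulting layout can be sketched with a hypothetical example (foo_value, foo_handler and foo_table are made-up names; the types come from linux/sysctl.h): handlers are defined first and the ctl_table arrays sit at the end of the file, so no forward declarations are needed.

static int foo_value;

static int foo_handler(struct ctl_table *table, int write,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}

/* kept below every handler it references, e.g. at the bottom of sysctl.c */
static struct ctl_table foo_table[] = {
	{
		.procname	= "foo",
		.data		= &foo_value,
		.maxlen		= sizeof(foo_value),
		.mode		= 0644,
		.proc_handler	= foo_handler,
	},
	{ }
};
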
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 3573 +++++++++++++++++++++++++++---------------------------- 1 file changed, 1768 insertions(+), 1805 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31b934865ebc..3fafca3ced98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,79 +176,13 @@ enum sysctl_writes_mode { }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif -#endif - -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; +#endif /* CONFIG_PROC_SYSCTL */ #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif -/* The default sysctl tables: */ - -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; - #ifdef CONFIG_SCHED_DEBUG static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ @@ -265,1676 +199,12 @@ static int min_extfrag_threshold; static int max_extfrag_threshold = 1000; #endif -static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_DEBUG - { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned 
int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, -#ifdef CONFIG_SMP - { - .procname = "sched_tunable_scaling", - .data = &sysctl_sched_tunable_scaling, - .maxlen = sizeof(enum sched_tunable_scaling), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_tunable_scaling, - .extra2 = &max_sched_tunable_scaling, - }, - { - .procname = "sched_migration_cost_ns", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_nr_migrate", - .data = &sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "numa_balancing", - .data = NULL, /* filled in by handler */ - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_numa_balancing, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 
= SYSCTL_ONE, - }, -#endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, - { - .procname = "sysctl_writes_strict", - .data = &sysctl_writes_strict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = 
sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_max_threads, - }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - 
.data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#endif - { - .procname = "ngroups_max", - .data = &ngroups_max, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "cap_last_cap", - .data = (void *)&cap_last_cap, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = 
&sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) - { - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN - { - .procname = "ignore-unaligned-usertrap", - .data = &no_unaligned_warning, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_IA64 - { - .procname = "unaligned-dump-stack", - .data = &unaligned_dump_stack, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DETECT_HUNG_TASK - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - }, -#endif -#ifdef CONFIG_RT_MUTEXES - { - .procname 
= "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_proc_update_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, - }, -#endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = proc_do_static_key, - }, -#endif -#if defined(CONFIG_TREE_RCU) - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table vm_table[] = { - 
{ - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - 
.extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -#endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &four, - }, -#ifdef CONFIG_COMPACTION - { - .procname = "compact_memory", - .data = &sysctl_compact_memory, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = sysctl_compaction_handler, - }, - { - .procname = "extfrag_threshold", - .data = &sysctl_extfrag_threshold, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, - }, - { - .procname = "compact_unevictable_allowed", - .data = &sysctl_compact_unevictable_allowed, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_warn_RT_change, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - -#endif /* CONFIG_COMPACTION */ - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, - }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = 
sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - { - .procname = "memory_failure_early_kill", - .data = &sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = 
sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -#ifdef CONFIG_USERFAULTFD - { - .procname = "unprivileged_userfaultfd", - .data = &sysctl_unprivileged_userfaultfd, - .maxlen = sizeof(sysctl_unprivileged_userfaultfd), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 
0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table debug_table[] = { -#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table dev_table[] = { - { } -}; - -int __init sysctl_init(void) -{ - struct ctl_table_header *hdr; - - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - +#endif /* CONFIG_SYSCTL */ + +/* + * /proc/sys support + */ + #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, @@ -3301,101 +1571,1794 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec(struct ctl_table *table, int write, - void 
__user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return -ENOSYS; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table kern_table[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, +#ifdef CONFIG_SMP + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { + .procname = "sched_migration_cost_ns", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 
0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_COREDUMP + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen 
= sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, +#endif +#ifdef CONFIG_BLK_DEV_INITRD + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PARISC + { + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_FUNCTION_TRACER + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, +#endif +#ifdef CONFIG_STACK_TRACER + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +#endif +#ifdef CONFIG_KEXEC_CORE + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, 
+#endif +#ifdef CONFIG_MODULES + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_UEVENT_HELPER + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = sysrq_sysctl_handler, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, + { + .procname = "random", + .mode = 0555, + .child = random_table, + }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#if defined CONFIG_PRINTK + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = 
DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#endif + { + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = &sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_X86) + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 
0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN + { + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_IA64 + { + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#ifdef CONFIG_KEYS + { + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! 
+ */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_proc_update_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, +#endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, +#endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table vm_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = 
&two, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = &one_ul, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = &dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, +#endif + { + .procname = 
"lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &four, + }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_warn_RT_change, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + +#endif /* CONFIG_COMPACTION */ + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &one_thousand, + }, + { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_MMU + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#else + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = 
sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_MMU + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, +#endif +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ + (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) + { + .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), +#endif + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + 
.extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table fs_table[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_FILE_LOCKING + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DNOTIFY + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_AIO + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif /* CONFIG_AIO */ +#ifdef CONFIG_INOTIFY_USER + { + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif +#endif + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = 
&two, + }, + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .procname = "binfmt_misc", + .mode = 0555, + .child = sysctl_mount_point, + }, +#endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table dev_table[] = { + { } +}; -#endif /* CONFIG_PROC_SYSCTL */ +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; -#if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int __init sysctl_init(void) { - struct static_key *key = (struct static_key *)table->data; - static DEFINE_MUTEX(static_key_mutex); - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; + struct ctl_table_header *hdr; - mutex_lock(&static_key_mutex); - val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (val) - static_key_enable(key); - else - static_key_disable(key); - } - mutex_unlock(&static_key_mutex); - return ret; + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); + return 0; } -#endif +#endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: 
sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handler just pass through the data to one of the common handlers a lot of the changes are mechnical. Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- kernel/bpf/cgroup.c | 35 +++---- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++++++++---------------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +-- 18 files changed, 132 insertions(+), 207 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. 
*/ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 
+173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff 
--git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. 
*/ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? 
: -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. 
*/ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long 
convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. 
*/ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t 
*ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? 
*/ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; -- cgit v1.2.3 From 23b5ae2e8e1326c91b5dfdbb6ebcd5a6820074ae Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 17 Apr 2020 22:50:31 +0800 Subject: locking/rtmutex: Remove unused rt_mutex_cmpxchg_relaxed() Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1587135032-188866-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/locking/rtmutex.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. 
*/ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) -- cgit v1.2.3 From 353159365e725fab7b3bf2817bec0ecd16706a38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:36:29 -0700 Subject: rcu: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9a49cd6065a..156ac8d0418b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -67,6 +67,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* -- cgit v1.2.3 From 4f58820fd710e0563e22420c07c03c5ccec948bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:33:01 -0700 Subject: srcu: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..ba2b751e9168 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; -- cgit v1.2.3 From 2f08469563550d15cb08a60898d3549720600eee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2020 05:29:58 -0800 Subject: rcu: Mark rcu_state.ncpus to detect concurrent writes The rcu_state structure's ncpus field is only to be modified by the CPU-hotplug CPU-online code path, which is single-threaded. This commit therefore enlists KCSAN's help in enforcing this restriction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..c88240a685ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3612,6 +3612,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. 
*/ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); -- cgit v1.2.3 From 314eeb43e5f22856b281c91c966e51e5782a3498 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 14:18:12 -0800 Subject: rcu: Add *_ONCE() and data_race() to rcu_node ->exp_tasks plus locking There are lockless loads from the rcu_node structure's ->exp_tasks field, so this commit causes all stores to use WRITE_ONCE() and all lockless loads to use READ_ONCE() or data_race(), with the latter for debug prints. This code also did a unprotected traversal of the linked list pointed into by ->exp_tasks, so this commit also acquires the rcu_node structure's ->lock to properly protect this traversal. This list was traversed unprotected only when printing an RCU CPU stall warning for an expedited grace period, so the odds of seeing this in production are not all that high. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 19 +++++++++++-------- kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..c2b04daf1190 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. 
*/ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -721,17 +721,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..35d77db035bd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -500,7 +500,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; @@ -761,7 +761,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -1036,7 +1036,7 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); -- cgit v1.2.3 From 065a6db12a80fac4eccc37e6bef14b0f4a610f31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:22:01 -0800 Subject: rcu: Add READ_ONCE and data_race() to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the READ_ONCE() to one load in order to avoid destructive compiler optimizations. 
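To make the convention in these ->exp_tasks and ->boost_tasks patches concrete: stores made while holding the node lock are wrapped in WRITE_ONCE(), lockless loads that the algorithm depends on in READ_ONCE(), and loads that only feed diagnostics in data_race(). The following is a minimal userspace sketch of that pattern, assuming simplified stand-in macros and an invented structure rather than the kernel's rcu_node; it illustrates the marking discipline, not the actual RCU code.

/*
 * Minimal sketch of the KCSAN access-marking convention, with simplified
 * stand-ins for the kernel's READ_ONCE()/WRITE_ONCE()/data_race() macros
 * and an invented structure standing in for rcu_node.
 */
#include <pthread.h>
#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
#define data_race(expr) (expr) /* the real macro also tells KCSAN to stay quiet */

static struct {
	pthread_mutex_t lock;
	void *boost_tasks; /* written under ->lock, read locklessly */
} node = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void set_boost_tasks(void *p)
{
	pthread_mutex_lock(&node.lock);
	WRITE_ONCE(node.boost_tasks, p); /* pairs with the lockless READ_ONCE() */
	pthread_mutex_unlock(&node.lock);
}

static int boost_needed(void)
{
	return READ_ONCE(node.boost_tasks) != NULL; /* algorithmic lockless load */
}

static void print_state(void)
{
	/* Diagnostic only: a stale value is acceptable, so data_race() suffices. */
	printf("boost_tasks=%p\n", data_race(node.boost_tasks));
}

int main(void)
{
	static int dummy;

	set_boost_tasks(&dummy);
	print_state();
	return boost_needed() ? 0 : 1;
}

The split is as much documentation as correctness: READ_ONCE() marks loads the algorithm relies on, while data_race() marks loads whose races are understood and tolerated.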
The other load is from a diagnostic print, so data_race() suffices. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 35d77db035bd..ed6bb460c7c6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -760,7 +760,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; @@ -1036,7 +1036,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); -- cgit v1.2.3 From b68c6146512d92f6d570d26e1873497ade2cc4cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:36:59 -0800 Subject: srcu: Add data_race() to ->srcu_lock_count and ->srcu_unlock_count arrays The srcu_data structure's ->srcu_lock_count and ->srcu_unlock_count arrays are read and written locklessly, so this commit adds the data_race() to the diagnostic-print loads from these arrays in order mark them as known and approved data-racy accesses. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being used only by rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ba2b751e9168..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1281,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1290,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; -- cgit v1.2.3 From 5822b8126ff01e0baaf7d5168adc4ac8aeae088c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Jan 2020 10:44:41 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. 
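These annotations build cleanly on kernels without KCSAN only because of the data_race() and ASSERT_EXCLUSIVE_WRITER() stubs added earlier in this series. Below is a small self-contained sketch of that fallback pattern; the gp_seq counter and its update step are invented for illustration and are not the kernel's implementation.

#include <stdio.h>

/*
 * Fallback stubs in the style of the "Add KCSAN stubs" commits above: when
 * the instrumentation does not provide the real macros, no-op versions keep
 * the expression's value and side effects so the same source still builds.
 */
#ifndef data_race
#define data_race(expr) \
	({ \
		expr; \
	})
#endif
#ifndef ASSERT_EXCLUSIVE_WRITER
#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
#endif

static unsigned long gp_seq; /* meant to be updated by a single writer */

static void gp_seq_advance(void)
{
	ASSERT_EXCLUSIVE_WRITER(gp_seq); /* no-op here; KCSAN would enforce it */
	gp_seq += 2; /* invented update step, for illustration only */
}

int main(void)
{
	gp_seq_advance();
	/* data_race() still yields the value; only the race report is waived. */
	printf("gp_seq=%lu\n", data_race(gp_seq));
	return 0;
}

With KCSAN enabled, the real ASSERT_EXCLUSIVE_WRITER() turns any concurrent writer into a report, which is how the rcu_state.gp_seq and rcu_state.ncpus patches in this series use it.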
Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed6bb460c7c6..664d0aa48edb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -505,7 +505,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -1082,7 +1082,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->qsmask == 0 && (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); -- cgit v1.2.3 From 47fbb074536ecca5e0af3dcce340954c09ed4d1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Feb 2020 02:29:36 -0800 Subject: rcu: Use data_race() for RCU CPU stall-warning prints Although the accesses used to determine whether or not a stall should be printed are an integral part of the concurrency algorithm governing use of the corresponding variables, the values that are simply printed are ancillary. As such, it is best to use data_race() for these accesses in order to provide the greatest latitude in the use of KCSAN for the other accesses that are an integral part of the algorithm. This commit therefore changes the relevant uses of READ_ONCE() to data_race(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..e7da1111ab0c 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -360,7 +360,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { @@ -421,10 +421,10 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); /* In this case, the current CPU might be at fault. 
*/ sched_show_task(current); @@ -581,23 +581,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,7 +607,7 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 1fca4d12f46371a34bf90af87f49548dd026c3ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Feb 2020 20:07:09 -0800 Subject: rcu: Expedite first two FQS scans under callback-overload conditions Even if some CPUs have excessive numbers of callbacks, RCU's grace-period kthread will still wait normally between successive force-quiescent-state scans. The first two are the most important, as they are the ones that enlist aid from the scheduler when overloaded. This commit therefore omits the wait before the first and the second force-quiescent-state scan under callback-overload conditions. This approach was inspired by a discussion with Jeff Roberson. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 +++++++++++++++---- kernel/rcu/tree.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c88240a685ec..0132bc6152dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1624,12 +1624,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. 
if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1667,13 +1671,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1698,7 +1704,11 @@ static void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1718,6 +1728,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..44edd0a98ffe 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ -- cgit v1.2.3 From fcbcc0e700500fcecf24b9b705825135de30415e Mon Sep 17 00:00:00 2001 From: Zhaolong Zhang Date: Thu, 5 Mar 2020 14:56:11 -0800 Subject: rcu: Fix the (t=0 jiffies) false positive It is possible that an over-long grace period will end while the RCU CPU stall warning message is printing. In this case, the estimate of the offending grace period's duration can be erroneous due to refetching of rcu_state.gp_start, which will now be the time of the newly started grace period. Computation of this duration clearly needs to use the start time for the old over-long grace period, not the fresh new one. This commit avoids such errors by causing both print_other_cpu_stall() and print_cpu_stall() to reuse the value previously fetched by their caller. Signed-off-by: Zhaolong Zhang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e7da1111ab0c..3a7bc99e78e3 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -371,7 +371,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -408,7 +408,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -442,7 +442,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. 
*/ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -467,7 +467,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +546,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +555,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From c28d5c09d09f86374a00b70a57d3cb75e3fc7fa9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 17 Mar 2020 15:54:18 +0100 Subject: rcu: Get rid of some doc warnings in update.c This commit escapes *ret, because otherwise the documentation system thinks that this is an incomplete emphasis block: ./kernel/rcu/update.c:65: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:65: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:70: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:82: WARNING: Inline emphasis start-string without end-string. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..72461dd80d29 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -63,12 +63,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +82,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). 
The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section -- cgit v1.2.3 From 62ae19511f1efbe9f57346ca1f45e13b061a56ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Mar 2020 19:52:20 -0700 Subject: rcu: Mark rcu_state.gp_seq to detect more concurrent writes The rcu_state structure's gp_seq field is only to be modified by the RCU grace-period kthread, which is single-threaded. This commit therefore enlists KCSAN's help in enforcing this restriction. This commit applies KCSAN-specific primitives, so cannot go upstream until KCSAN does. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0132bc6152dd..183b9cf33f50 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1230,7 +1230,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1519,6 +1519,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1805,6 +1806,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); -- cgit v1.2.3 From a66dbda7893f48b97d7406ae42fa29190aa672a0 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Fri, 27 Mar 2020 21:23:53 +0000 Subject: rcu: Replace assigned pointer ret value by corresponding boolean value Coccinelle reports warnings at rcu_read_lock_held_common() WARNING: Assignment of 0/1 to bool variable To fix this, the assigned pointer ret values are replaced by corresponding boolean value. Given that ret is a pointer of bool type Signed-off-by: Jules Irenge Signed-off-by: Paul E. 
McKenney --- kernel/rcu/update.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 72461dd80d29..17f23569e21a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -98,15 +98,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; -- cgit v1.2.3 From da44cd6c8e88b6da3d5277d0e7b0e4d38faf4532 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 30 Mar 2020 02:24:48 +0100 Subject: rcu: Replace 1 by true Coccinelle reports a warning at use_softirq declaration WARNING: Assignment of 0/1 to bool variable The root cause is use_softirq a variable of bool type is initialised with the integer 1 Replacing 1 with value true solve the issue. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 183b9cf33f50..940c62acd14a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -113,7 +113,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; -- cgit v1.2.3 From 29ffebc5fcc0bcd8e9bc9f9b3899f2d222b12b04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 14:48:20 -0700 Subject: rcu: Convert ULONG_CMP_GE() to time_after() for jiffy comparison This commit converts the ULONG_CMP_GE() in rcu_gp_fqs_loop() to time_after() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 940c62acd14a..e5903669e336 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1700,7 +1700,7 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); -- cgit v1.2.3 From 7b2413111a630469282c427033818977515ea592 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 15:52:53 -0700 Subject: rcu: Convert rcu_initiate_boost() ULONG_CMP_GE() to time_after() This commit converts the ULONG_CMP_GE() in rcu_initiate_boost() to time_after() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 664d0aa48edb..5cd27c2916c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1080,7 +1080,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From e2f3ccfa62001994ed3e81c309face75aaa8d372 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 17:05:22 -0700 Subject: rcu: Convert rcu_nohz_full_cpu() ULONG_CMP_LT() to time_before() This commit converts the ULONG_CMP_LT() in rcu_nohz_full_cpu() to time_before() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5cd27c2916c0..5771e32a3840 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2537,7 +2537,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; -- cgit v1.2.3 From f87dc808009ac86c790031627698ef1a34c31e25 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:26 -0400 Subject: rcuperf: Add ability to increase object allocation size This allows us to increase memory pressure dynamically using a new rcuperf boot command line parameter called 'rcumult'. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { -- cgit v1.2.3 From 9154244c1ab6c9db4f1f25ac8f73bd46dba64287 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:27 -0400 Subject: rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching To reduce grace periods and improve kfree() performance, we have done batching recently dramatically bringing down the number of grace periods while giving us the ability to use kfree_bulk() for efficient kfree'ing. However, this has increased the likelihood of OOM condition under heavy kfree_rcu() flood on small memory systems. This patch introduces a shrinker which starts grace periods right away if the system is under memory pressure due to existence of objects that have still not started a grace period. With this patch, I do not observe an OOM anymore on a system with 512MB RAM and 8 CPUs, with the following rcuperf options: rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000 rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2 Otherwise it easily OOMs with the above parameters. NOTE: 1. On systems with no memory pressure, the patch has no effect as intended. 2. In the future, we can use this same mechanism to prevent grace periods from happening even more, by relying on shrinkers carefully. Cc: urezki@gmail.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..e299cd0ddd97 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2824,6 +2824,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2937,6 +2939,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + krcp->count = 0; + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3073,6 +3077,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + krcp->count++; + // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3087,6 +3093,58 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long flags, count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + count += krcp->count; + spin_unlock_irqrestore(&krcp->lock, flags); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -4007,6 +4065,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) -- cgit v1.2.3 From a6a82ce18ba443186545d3fefbee8b9419a859dc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:28 -0400 Subject: rcu/tree: Count number of batched kfree_rcu() locklessly We can relax the correctness of counting of number of queued objects in favor of not hurting performance, by locklessly sampling per-cpu counters. This should be Ok since under high memory pressure, it should not matter if we are off by a few objects while counting. The shrinker will still do the reclaim. Signed-off-by: Joel Fernandes (Google) [ paulmck: Remove unused "flags" variable. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e299cd0ddd97..3f1f57411fe1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2939,7 +2939,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } - krcp->count = 0; + WRITE_ONCE(krcp->count, 0); /* * One work is per one batch, so there are two "free channels", @@ -3077,7 +3077,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } - krcp->count++; + WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && @@ -3097,15 +3097,13 @@ static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; - unsigned long flags, count = 0; + unsigned long count = 0; /* Snapshot count of all CPUs */ for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); - count += krcp->count; - spin_unlock_irqrestore(&krcp->lock, flags); + count += READ_ONCE(krcp->count); } return count; -- cgit v1.2.3 From 6be7436d2245d3dd8b9a8f949367c13841c23308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 13:47:41 -0700 Subject: rcu: Add rcu_gp_might_be_stalled() This commit adds rcu_gp_might_be_stalled(), which returns true if there is some reason to believe that the RCU grace period is stalled. The use case is where an RCU free-memory path needs to allocate memory in order to free it, a situation that should be avoided where possible. But where it is necessary, there is always the alternative of using synchronize_rcu() to wait for a grace period in order to avoid the allocation. And if the grace period is stalled, allocating memory to asynchronously wait for it is a bad idea of epic proportions: Far better to let others use the memory, because these others might actually be able to free that memory before the grace period ends. Thus, rcu_gp_might_be_stalled() can be used to help decide whether allocating memory on an RCU free path is a semi-reasonable course of action. Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4dede00e2936 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. 
+ smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -- cgit v1.2.3 From c76e7e0bce10876e6b08ac2ce8af5ef7cba684ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Apr 2020 10:19:02 -0700 Subject: rcu: Add KCSAN stubs to update.c This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..74a698aa9027 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -- cgit v1.2.3 From e4453d8a1c56050df320ef54f339ffa4a9513d0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Feb 2020 14:18:09 -0800 Subject: rcu: Make rcu_read_unlock_special() safe for rq/pi locks The scheduler is currently required to hold rq/pi locks across the entire RCU read-side critical section or not at all. This is inconvenient and leaves traps for the unwary, including the author of this commit. But now that excessively long grace periods enable scheduling-clock interrupts for holdout nohz_full CPUs, the nohz_full rescue logic in rcu_read_unlock_special() can be dispensed with. In other words, the rcu_read_unlock_special() function can refrain from doing wakeups unless such wakeups are guaranteed safe. This commit therefore avoids unsafe wakeups, freeing the scheduler to hold rq/pi locks across rcu_read_unlock() even if the corresponding RCU read-side critical section might have been preempted. This commit also updates RCU's requirements documentation. This commit is inspired by a patch from Lai Jiangshan: https://lore.kernel.org/lkml/20191102124559.1135-2-laijs@linux.alibaba.com This commit is further intended to be a step towards his goal of permitting the inlining of RCU-preempt's rcu_read_lock() and rcu_read_unlock(). Cc: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..ccad77639d80 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,19 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && -- cgit v1.2.3 From 07b4a930fc44a537efecf73c1fd2b4937f64caaa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:37:26 -0800 Subject: rcu: Don't set nesting depth negative in rcu_preempt_deferred_qs() Now that RCU flavors have been consolidated, an RCU-preempt rcu_read_unlock() in an interrupt or softirq handler cannot possibly end the RCU read-side critical section. Consider the old vulnerability involving rcu_preempt_deferred_qs() being invoked within such a handler that interrupted an extended RCU read-side critical section, in which a wakeup might be invoked with a scheduler lock held. Because rcu_read_unlock_special() no longer does wakeups in such situations, it is no longer necessary for rcu_preempt_deferred_qs() to set the nesting level negative. This commit therefore removes this recursion-protection code from rcu_preempt_deferred_qs(). [ paulmck: Fix typo in commit log per Steve Rostedt. ] Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ccad77639d80..263c766b9dc1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -569,16 +569,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* -- cgit v1.2.3 From f0bdf6d473cf12a488a78422e15aafdfe77cf853 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:52:32 -0800 Subject: rcu: Remove unused ->rcu_read_unlock_special.b.deferred_qs field The ->rcu_read_unlock_special.b.deferred_qs field is set to true in rcu_read_unlock_special() but never set to false. This is not particularly useful, so this commit removes this field. The only possible justification for this field is to ease debugging of RCU deferred quiscent states, but the combination of the other ->rcu_read_unlock_special fields plus ->rcu_blocked_node and of course ->rcu_read_lock_nesting should cover debugging needs. And if this last proves incorrect, this patch can always be reverted, along with the required setting of ->rcu_read_unlock_special.b.deferred_qs to false in rcu_preempt_deferred_qs_irqrestore(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 263c766b9dc1..f31c5992f842 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -634,7 +634,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3 From 5f5fa7ea89dc82d34ed458f4d7a8634e8e9eefce Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 15:23:26 -0800 Subject: rcu: Don't use negative nesting depth in __rcu_read_unlock() Now that RCU flavors have been consolidated, an RCU-preempt rcu_read_unlock() in an interrupt or softirq handler cannot possibly end the RCU read-side critical section. Consider the old vulnerability involving rcu_read_unlock() being invoked within such a handler that interrupted an __rcu_read_unlock_special(), in which a wakeup might be invoked with a scheduler lock held. Because rcu_read_unlock_special() no longer does wakeups in such situations, it is no longer necessary for __rcu_read_unlock() to set the nesting level negative. This commit therefore removes this recursion-protection code from __rcu_read_unlock(). [ paulmck: Let rcu_exp_handler() continue to call rcu_report_exp_rdp(). ] [ paulmck: Adjust other checks given no more negative nesting. ] Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 31 +++++-------------------------- kernel/rcu/tree_plugin.h | 22 +++++++--------------- 2 files changed, 12 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..0e5ccb330f9d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f31c5992f842..088e84e4578f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -345,9 +345,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. 
*/ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +353,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +388,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -556,7 +548,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -692,7 +684,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } -- cgit v1.2.3 From 52b1fc3f798d02a3a9d1cf7a84e98a795223410a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Mar 2020 18:53:25 -0700 Subject: rcutorture: Add test of holding scheduler locks across rcu_read_unlock() Now that it should be safe to hold scheduler locks across rcu_read_unlock(), even in cases where the corresponding RCU read-side critical section might have been preempted and boosted, the commit adds a test of this capability to rcutorture. This has been tested on current mainline (which can deadlock in this situation), and lockdep duly reported the expected deadlock. On -rcu, lockdep is silent, thus far, anyway. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..b348cf816d89 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1147,6 +1147,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1182,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. 
*/ if ((statesnew || statesold) && *readstate && newstate) -- cgit v1.2.3 From ac3caf827488d3bc4d4101ff34931abbfa33839d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Mar 2020 17:01:57 -0700 Subject: rcu: Add comments marking transitions between RCU watching and not It is not as clear as it might be just where in RCU's idle entry/exit code RCU stops and starts watching the current CPU. This commit therefore adds comments calling out the transitions. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..0bbcbf398169 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -238,7 +238,9 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ static void rcu_dynticks_eqs_enter(void) { @@ -251,7 +253,7 @@ static void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -261,7 +263,8 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ static void rcu_dynticks_eqs_exit(void) { @@ -274,6 +277,7 @@ static void rcu_dynticks_eqs_exit(void) * critical section. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { @@ -584,6 +588,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; return; } @@ -596,7 +601,9 @@ static void rcu_eqs_enter(bool user) rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -676,7 +683,9 @@ static __always_inline void rcu_nmi_exit_common(bool irq) if (irq) rcu_prepare_for_idle(); + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. if (irq) rcu_dynticks_task_enter(); @@ -751,11 +760,14 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. 
rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -832,7 +844,9 @@ static __always_inline void rcu_nmi_enter_common(bool irq) if (irq) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. if (irq) rcu_cleanup_after_idle(); @@ -842,9 +856,16 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !READ_ONCE(rdp->rcu_forced_tick)) { + // We get here only if we had already exited the extended + // quiescent state and this was an interrupt (not an NMI). + // Therefore, (1) RCU is already watching and (2) The fact + // that we are in an interrupt handler and that the rcu_node + // lock is an irq-disabled lock prevents self-deadlock. + // So we can safely recheck under the lock. raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU + // needs a quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } -- cgit v1.2.3 From 66777e5821f6e672003fde697b8489402bb5aa98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 15:22:44 -0700 Subject: rcu-tasks: Use context-switch hook for PREEMPT=y kernels Currently, the PREEMPT=y version of rcu_note_context_switch() does not invoke rcu_tasks_qs(), and we need it to in order to keep RCU Tasks Trace's IPIs down to a dull roar. This commit therefore enables this hook. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 088e84e4578f..4f34c325dd90 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,6 +331,8 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + if (!preempt) + rcu_tasks_qs(current); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -- cgit v1.2.3 From 2beaf3280e57bb891f8012dca49c87ed0f01e2f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 14:23:21 -0700 Subject: sched/core: Add function to sample state of locked-down task A running task's state can be sampled in a consistent manner (for example, for diagnostic purposes) simply by invoking smp_call_function_single() on its CPU, which may be obtained using task_cpu(), then having the IPI handler verify that the desired task is in fact still running. However, if the task is not running, this sampling can in theory be done immediately and directly. In practice, the task might start running at any time, including during the sampling period. Gaining a consistent sample of a not-running task therefore requires that something be done to lock down the target task's state. This commit therefore adds a try_invoke_on_locked_down_task() function that invokes a specified function if the specified task can be locked down, returning true if successful and if the specified function returns true. Otherwise this function simply returns false. Given that the function passed to try_invoke_on_nonrunning_task() might be invoked with a runqueue lock held, that function had better be quite lightweight. 
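(Editorial aside, not part of the patch: a minimal, hypothetical sketch of how a diagnostic path might use this primitive, assuming only the try_invoke_on_locked_down_task() signature added in the diff below. The names my_sample and sample_task_state() are invented for illustration; the actual in-tree user is the RCU CPU stall-warning code in a later patch in this series.)

    struct my_sample {
            unsigned long nvcsw;    /* Context-switch count sampled from the task. */
            int on_rq;              /* Was the task queued on a runqueue? */
    };

    /* Invoked with the task pinned (sleeping or runqueue-locked); keep it lightweight. */
    static bool sample_task_state(struct task_struct *t, void *arg)
    {
            struct my_sample *s = arg;

            if (task_curr(t))
                    return false;   /* Task is running; decline, caller may retry or send an IPI. */
            s->nvcsw = t->nvcsw;
            s->on_rq = t->on_rq;
            return true;            /* State was captured while the task was locked down. */
    }

    /* Hypothetical caller side, for some task_struct *t of interest: */
    struct my_sample s;

    if (try_invoke_on_locked_down_task(t, sample_task_state, &s))
            pr_info("pid %d: nvcsw=%lu on_rq=%d\n", t->pid, s.nvcsw, s.on_rq);
    else
            pr_info("pid %d: could not be locked down, try again later\n", t->pid);

(End of aside; the commit message continues below.)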
The function is passed the target task's task_struct pointer and the argument passed to try_invoke_on_locked_down_task(), allowing easy access to task state and to a location for further variables to be passed in and out. Note that the specified function will be called even if the specified task is currently running. The function can use ->on_rq and task_curr() to quickly and easily determine the task's state, and can return false if this state is not to the function's liking. The caller of the try_invoke_on_locked_down_task() would then see the false return value, and could take appropriate action, for example, trying again later or sending an IPI if matters are more urgent. It is expected that use cases such as the RCU CPU stall warning code will simply return false if the task is currently running. However, there are use cases involving nohz_full CPUs where the specified function might instead fall back to an alternative sampling scheme that relies on heavier synchronization (such as memory barriers) in the target task. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Ben Segall Cc: Mel Gorman [ paulmck: Apply feedback from Peter Zijlstra and Steven Rostedt. ] [ paulmck: Invoke if running to handle feedback from Mathieu Desnoyers. ] Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5ca567adfcb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2566,6 +2566,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2639,6 +2641,52 @@ out: return success; } +/** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). 
+ if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. -- cgit v1.2.3 From 5bef8da66a9c45d3c371d74463894f1fc31dfdda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 17:35:46 -0700 Subject: rcu: Add per-task state to RCU CPU stall warnings Currently, an RCU-preempt CPU stall warning simply lists the PIDs of those tasks holding up the current grace period. This can be helpful, but more can be even more helpful. To this end, this commit adds the nesting level, whether the task thinks it was preempted in its current RCU read-side critical section, whether RCU core has asked this task for a quiescent state, whether the expedited-grace-period hint is set, and whether the task believes that it is on the blocked-tasks list (it must be, or it would not be printed, but if things are broken, best not to take too much for granted). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..c65c9759e038 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -192,14 +192,40 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_node *rnp; + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rnp = t->rcu_blocked_node; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +234,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); -- cgit v1.2.3 From eacd6f04a1333187dd3e96e5635c0edce0a2e354 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 11:59:20 -0800 Subject: rcu-tasks: Move Tasks RCU to its own file This code-movement-only commit is in preparation for adding an additional flavor of Tasks RCU, which relies on workqueues to detect grace periods. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 370 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 366 +-------------------------------------------------- 2 files changed, 372 insertions(+), 364 deletions(-) create mode 100644 kernel/rcu/tasks.h (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..be8d179a4ca9 --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. + */ + +/* Global list of callbacks and associated lock. */ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +static struct task_struct *rcu_tasks_kthread_ptr; + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + needwake = !rcu_tasks_cbs_head; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) + wake_up(&rcu_tasks_cbs_wq); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. 
+ * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +/* See if tasks are still holding out, complain if so. 
*/ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + unsigned long lastreport; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rcu_tasks_cbs_wq, + READ_ONCE(rcu_tasks_cbs_head)); + if (!rcu_tasks_cbs_head) { + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_rcu() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_rcu(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_rcu() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. 
+ * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_rcu() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && + time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() + * to force the needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. + */ + synchronize_rcu(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_uninterruptible(HZ/10); + } +} + +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ +static int __init rcu_spawn_tasks_kthread(void) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; + smp_mb(); /* Ensure others see full kthread. */ + WRITE_ONCE(rcu_tasks_kthread_ptr, t); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. 
+ */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 74a698aa9027..c5799349ff31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -514,370 +514,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. 
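For illustration only (not part of the patch): the callback list used above is a classic tail-pointer queue, with the tail initialized to point at the head so that enqueueing never needs an empty-list special case. A standalone sketch of the same structure, minus the WRITE_ONCE() and the irq-disabled spinlock that the kernel version also requires:

	#include <stddef.h>

	struct node { struct node *next; };

	static struct node *head;		/* NULL while the queue is empty      */
	static struct node **tail = &head;	/* always points at the last ->next   */

	static void enqueue(struct node *n)
	{
		n->next = NULL;
		*tail = n;		/* append; works for empty and non-empty */
		tail = &n->next;
	}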
*/ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. 
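For illustration only (not part of the patch): a hedged usage sketch of the two update-side APIs documented above. struct example_probe and its helpers are invented names; the includes shown are what such a caller would plausibly need.

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_probe {
		void (*handler)(void);
		struct rcu_head rh;
	};

	static void example_probe_free(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct example_probe, rh));
	}

	/* Asynchronous: free once all tasks have passed a quiescent state. */
	static void example_probe_retire(struct example_probe *p)
	{
		call_rcu_tasks(&p->rh, example_probe_free);
	}

	/* Synchronous: block for a full Tasks-RCU grace period, then free. */
	static void example_probe_retire_sync(struct example_probe *p)
	{
		synchronize_rcu_tasks();
		kfree(p);
	}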
*/ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. 
- * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. 
- */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -948,6 +584,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* -- cgit v1.2.3 From 07e105158d97b4969891e844f318d16f6cef566c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 15:16:57 -0800 Subject: rcu-tasks: Create struct to hold state information This commit creates an rcu_tasks struct to hold state information for RCU Tasks. This is a preparation commit for adding additional flavors of Tasks RCU, each of which would have its own rcu_tasks struct. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 73 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index be8d179a4ca9..5ccfe0d64e6a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -7,6 +7,30 @@ #ifdef CONFIG_TASKS_RCU +/** + * Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + struct task_struct *kthread_ptr; +}; + +#define DEFINE_RCU_TASKS(name) \ +static struct rcu_tasks name = \ +{ \ + .cbs_tail = &name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ +} + /* * Simple variant of RCU whose quiescent states are voluntary context * switch, cond_resched_rcu_qs(), user-space execution, and idle. @@ -18,12 +42,7 @@ * rates from multiple CPUs. If this is required, per-CPU callback lists * will be needed. */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); +DEFINE_RCU_TASKS(rcu_tasks); /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -33,8 +52,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -static struct task_struct *rcu_tasks_kthread_ptr; - /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @rhp: structure to be used for queueing the RCU updates. 
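For reference (not part of the patch): DEFINE_RCU_TASKS(rcu_tasks) above expands to roughly the following; cbs_head and kthread_ptr remain implicitly NULL, and the self-referential cbs_tail makes the empty callback list well-formed from the start.

	static struct rcu_tasks rcu_tasks = {
		.cbs_tail = &rcu_tasks.cbs_head,
		.cbs_wq   = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_tasks.cbs_wq),
		.cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_tasks.cbs_lock),
	};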
@@ -57,17 +74,18 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -169,10 +187,12 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + struct rcu_tasks *rtp = arg; int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -183,17 +203,17 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); /* If there were none, wait a bit and start over. */ if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); } @@ -211,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) * * This synchronize_rcu() also dispenses with the * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen + * t->rcu_tasks_holdout, as it forces the store to happen * after the beginning of the grace period. */ synchronize_rcu(); @@ -278,7 +298,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) firstreport = true; WARN_ON(signal_pending(current)); list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { + rcu_tasks_holdout_list) { check_holdout_task(t, needreport, &firstreport); cond_resched(); } @@ -325,11 +345,10 @@ static int __init rcu_spawn_tasks_kthread(void) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); return 0; } core_initcall(rcu_spawn_tasks_kthread); -- cgit v1.2.3 From 9cf8fc6fabd46d7f4729529f88d627ce85c6e970 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 6 Mar 2020 14:00:46 -0800 Subject: rcutorture: Add a test for synchronize_rcu_mult() This commit adds a crude test for synchronize_rcu_mult(). This is currently a smoke test rather than a high-quality stress test. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b348cf816d89..fbb3e6247443 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -665,6 +665,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +679,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, -- cgit v1.2.3 From 5873b8a94e5dae04b8e11fc798df512614e6d1e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 11:49:21 -0800 Subject: rcu-tasks: Refactor RCU-tasks to allow variants to be added This commit splits out generic processing from RCU-tasks-specific processing in order to allow additional flavors to be added. It also adds a def_bool TASKS_RCU_GENERIC to enable the common RCU-tasks infrastructure code. This is primarily, but not entirely, a code-movement commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 10 +- kernel/rcu/tasks.h | 491 +++++++++++++++++++++++++++------------------------- kernel/rcu/update.c | 4 + 3 files changed, 269 insertions(+), 236 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..38475d0bc634 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. config RCU_STALL_COMMON def_bool TREE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5ccfe0d64e6a..d77921ee5a6e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,7 +5,13 @@ * Copyright (C) 2020 Paul E. McKenney */ -#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -14,6 +20,8 @@ * @cbs_wq: Wait queue allowning new callback to get kthread's attention. * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. 
+ * @call_func: This flavor's call_rcu()-equivalent function. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -21,29 +29,20 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + call_rcu_func_t call_func; }; -#define DEFINE_RCU_TASKS(name) \ +#define DEFINE_RCU_TASKS(name, gp, call) \ static struct rcu_tasks name = \ { \ .cbs_tail = &name.cbs_head, \ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ } -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ -DEFINE_RCU_TASKS(rcu_tasks); - /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -52,29 +51,16 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) { unsigned long flags; bool needwake; - struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; @@ -87,108 +73,25 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); } -EXPORT_SYMBOL_GPL(call_rcu_tasks); -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. 
These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); + wait_rcu_gp(rtp->call_func); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. 
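For illustration only (not part of the patch): a hypothetical extra flavor built from the generic pieces above, mirroring the pattern the real flavors follow later in this series. Every rcu_example* name is invented; the flavor-specific grace-period function is left as a prototype.

	static void rcu_example_wait_gp(struct rcu_tasks *rtp);
	void call_rcu_example(struct rcu_head *rhp, rcu_callback_t func);
	DEFINE_RCU_TASKS(rcu_example, rcu_example_wait_gp, call_rcu_example);

	/* Queue a callback for the hypothetical flavor. */
	void call_rcu_example(struct rcu_head *rhp, rcu_callback_t func)
	{
		call_rcu_tasks_generic(rhp, func, &rcu_example);
	}

	/* Wait for a grace period of the hypothetical flavor. */
	void synchronize_rcu_example(void)
	{
		synchronize_rcu_tasks_generic(&rcu_example);
	}

All that remains is the grace-period-wait function itself and a kthread spawned via rcu_spawn_tasks_kthread_generic(), which appears a little further down in this patch.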
*/ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; struct rcu_head *list; struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); struct rcu_tasks *rtp = arg; - int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -220,111 +123,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) continue; } - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * t->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. 
- * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); + // Wait for one grace period. + rtp->gp_func(rtp); /* Invoke the callbacks. */ while (list) { @@ -340,18 +140,16 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; + return; smp_mb(); /* Ensure others see full kthread. */ - return 0; } -core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -369,8 +167,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) preempt_enable(); } -#endif /* #ifdef CONFIG_TASKS_RCU */ - #ifndef CONFIG_TINY_RCU /* @@ -387,3 +183,230 @@ static void __init rcu_tasks_bootup_oddness(void) } #endif /* #ifndef CONFIG_TINY_RCU */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. 
Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. 
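For illustration only (not part of the patch): the holdout-polling loop above sleeps HZ/fract jiffies per pass, backing off from HZ/10 to a full HZ. A small standalone sketch of the resulting sleep sequence, assuming a tick rate of 1000 purely for the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		const int hz = 1000;	/* assumed tick rate, illustration only */
		int fract = 10;
		int pass;

		for (pass = 1; pass <= 12; pass++) {
			printf("pass %2d: sleep %4d jiffies\n", pass, hz / fract);
			if (fract > 1)
				fract--;	/* same backoff rule as the loop above */
		}
		return 0;
	}

The printed sleeps run 100, 111, 125, ... jiffies per pass and settle at 1000 once fract reaches 1.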
+ * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..30dce20e1644 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,7 +584,11 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From c84aad765406c4c7573ce449e8a9977ebb8f4cb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 21:06:43 -0800 Subject: rcu-tasks: Add an RCU-tasks rude variant This commit adds a "rude" variant of RCU-tasks that has as quiescent states schedule(), cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, anyway) cond_resched(). 
In other words, RCU-tasks rude readers are regions of code with preemption disabled, but excluding code early in the CPU-online sequence and late in the CPU-offline sequence. Updates make use of IPIs and force an IPI and a context switch on each online CPU. This variant is useful in some situations in tracing. Suggested-by: Steven Rostedt [ paulmck: Apply EXPORT_SYMBOL_GPL() feedback from Qiujun Huang. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply review feedback from Steve Rostedt. ] --- kernel/rcu/Kconfig | 11 +++++- kernel/rcu/tasks.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 38475d0bc634..6ee6372a4459 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -84,6 +84,15 @@ config TASKS_RCU only voluntary context switch (not preemption!), idle, and user-mode execution as quiescent states. Not for manual selection. +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d77921ee5a6e..7f9ed20c26c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,6 +180,9 @@ static void __init rcu_tasks_bootup_oddness(void) else pr_info("\tTasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -410,3 +413,98 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. 
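For illustration only (not part of the patch): a hedged sketch of the kind of update-side use the rude flavor targets, using the synchronous API added just below. example_hook, example_unhook() and example_free() are invented stand-ins for whatever disconnects and reclaims a callback that runs with preemption disabled.

	static void example_retire_preempt_off_hook(struct example_hook *hp)
	{
		example_unhook(hp);		/* invented: stop new invocations         */
		synchronize_rcu_tasks_rude();	/* IPI + context switch on every CPU      */
		example_free(hp);		/* invented: nothing can still be running */
	}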
call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -- cgit v1.2.3 From 3d6e43c75d6bab212e8bc142585ee36eb8e2e5d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 15:02:50 -0800 Subject: rcutorture: Add torture tests for RCU Tasks Rude This commit adds the definitions required to torture the rude flavor of RCU tasks. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/Kconfig.debug | 2 ++ kernel/rcu/rcu.h | 1 + kernel/rcu/rcutorture.c | 31 +++++++++++++++++++++++++++++-- 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..b5f3545b0cfb 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,7 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +47,7 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..c5746202e124 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -441,6 +441,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fbb3e6247443..6b0663801a82 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -730,6 +730,33 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -739,7 +766,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -2413,7 +2440,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From c97d12a63c26fc4521d0904f073f9997ae796cba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 15:50:31 -0800 Subject: rcu-tasks: Use unique names for RCU-Tasks kthreads and messages This commit causes the flavors of RCU Tasks to use different names for their kthreads and in their console messages. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7f9ed20c26c7..9ca83c68e486 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -22,6 +22,8 @@ typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -31,16 +33,20 @@ struct rcu_tasks { struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; call_rcu_func_t call_func; + char *name; + char *kname; }; -#define DEFINE_RCU_TASKS(name, gp, call) \ -static struct rcu_tasks name = \ +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ { \ - .cbs_tail = &name.cbs_head, \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ - .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ .gp_func = gp, \ .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ } /* Track exiting tasks in order to allow them to be waited for. */ @@ -145,8 +151,8 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) return; smp_mb(); /* Ensure others see full kthread. */ } @@ -342,7 +348,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) } void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period @@ -437,7 +443,8 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) } void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); /** * call_rcu_tasks_rude() - Queue a callback rude task-based grace period -- cgit v1.2.3 From e4fe5dd6f26f74233e217d9dd351adc3e5165bb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Mar 2020 17:31:43 -0800 Subject: rcu-tasks: Further refactor RCU-tasks to allow adding more variants This commit refactors RCU tasks to allow variants to be added. These variants will share the current Tasks-RCU tasklist scan and the holdout list processing. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 166 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9ca83c68e486..344426e2390d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -12,6 +12,11 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(void); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(void); /** * Definition for a Tasks-RCU-like mechanism. @@ -21,6 +26,11 @@ typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdout_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @name: This flavor's textual name. * @kname: This flavor's kthread name. @@ -32,6 +42,11 @@ struct rcu_tasks { raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; call_rcu_func_t call_func; char *name; char *kname; @@ -113,6 +128,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_unlock_lock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -207,6 +223,49 @@ static void __init rcu_tasks_bootup_oddness(void) // rates from multiple CPUs. If this is required, per-CPU callback lists // will be needed. +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning taskslist and draining the holdout list. */ +void rcu_tasks_postscan(void) +{ + /* + * Wait for tasks that are in the process of exiting. 
This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + /* See if tasks are still holding out, complain if so. */ static void check_holdout_task(struct task_struct *t, bool needreport, bool *firstreport) @@ -239,55 +298,63 @@ static void check_holdout_task(struct task_struct *t, sched_show_task(t); } +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(void) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { struct task_struct *g, *t; unsigned long lastreport; - LIST_HEAD(rcu_tasks_holdouts); + LIST_HEAD(holdouts); int fract; - /* - * Wait for all pre-existing t->on_rq and t->nvcsw transitions - * to complete. Invoking synchronize_rcu() suffices because all - * these transitions occur with interrupts disabled. Without this - * synchronize_rcu(), a read-side critical section that started - * before the grace period might be incorrectly seen as having - * started after the grace period. - * - * This synchronize_rcu() also dispenses with the need for a - * memory barrier on the first store to t->rcu_tasks_holdout, - * as it forces the store to happen after the beginning of the - * grace period. - */ - synchronize_rcu(); + rtp->pregp_func(); /* * There were callbacks, so we need to wait for an RCU-tasks * grace period. Start off by scanning the task list for tasks * that are not already voluntarily blocked. Mark these tasks - * and make a list of them in rcu_tasks_holdouts. + * and make a list of them in holdouts. */ rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); rcu_read_unlock(); - /* - * Wait for tasks that are in the process of exiting. This - * does only part of the job, ensuring that all tasks that were - * previously exiting reach the point where they have disabled - * preemption, allowing the later synchronize_rcu() to finish - * the job. 
- */ - synchronize_srcu(&tasks_rcu_exit_srcu); + rtp->postscan_func(); /* * Each pass through the following loop scans the list of holdout @@ -303,9 +370,8 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) bool firstreport; bool needreport; int rtst; - struct task_struct *t1; - if (list_empty(&rcu_tasks_holdouts)) + if (list_empty(&holdouts)) break; /* Slowly back off waiting for holdouts */ @@ -320,31 +386,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) lastreport = jiffies; firstreport = true; WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } + rtp->holdouts_func(&holdouts, needreport, &firstreport); } - /* - * Because ->on_rq and ->nvcsw are not guaranteed to have a full - * memory barriers prior to them in the schedule() path, memory - * reordering on other CPUs could cause their RCU-tasks read-side - * critical sections to extend past the end of the grace period. - * However, because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() to force the - * needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all ->rcu_tasks_holdout - * accesses to be within the grace period, avoiding the need for - * memory barriers for ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting tasks - * to complete their final preempt_disable() region of execution, - * cleaning up after the synchronize_srcu() above. - */ - synchronize_rcu(); + rtp->postgp_func(); } void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); @@ -413,6 +458,11 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -- cgit v1.2.3 From d01aa2633b5d5ebc16fa47ad7a5e8e9f00482554 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Mar 2020 17:07:07 -0800 Subject: rcu-tasks: Code movement to allow more Tasks RCU variants This commit does nothing but move rcu_tasks_wait_gp() up to a new section for common code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 122 +++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 344426e2390d..d8b09d5d1db1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -211,6 +211,69 @@ static void __init rcu_tasks_bootup_oddness(void) #ifdef CONFIG_TASKS_RCU +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. 
+ */ + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + rtp->postscan_func(); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + rtp->postgp_func(); +} + //////////////////////////////////////////////////////////////////////// // // Simple variant of RCU whose quiescent states are voluntary context @@ -333,65 +396,6 @@ static void rcu_tasks_postgp(void) synchronize_rcu(); } -/* Wait for one RCU-tasks grace period. */ -static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) -{ - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); - int fract; - - rtp->pregp_func(); - - /* - * There were callbacks, so we need to wait for an RCU-tasks - * grace period. Start off by scanning the task list for tasks - * that are not already voluntarily blocked. Mark these tasks - * and make a list of them in holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) - rtp->pertask_func(t, &holdouts); - rcu_read_unlock(); - - rtp->postscan_func(); - - /* - * Each pass through the following loop scans the list of holdout - * tasks, removing any that are no longer holdouts. When the list - * is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - - if (list_empty(&holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - rtp->holdouts_func(&holdouts, needreport, &firstreport); - } - - rtp->postgp_func(); -} - void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); -- cgit v1.2.3 From d5f177d35c24429c87db2567d20563fc16f7e8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Mar 2020 19:56:53 -0700 Subject: rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks Because RCU does not watch exception early-entry/late-exit, idle-loop, or CPU-hotplug execution, protection of tracing and BPF operations is needlessly complicated. This commit therefore adds a variant of Tasks RCU that: o Has explicit read-side markers to allow finite grace periods in the face of in-kernel loops for PREEMPT=n builds. These markers are rcu_read_lock_trace() and rcu_read_unlock_trace(). o Protects code in the idle loop, exception entry/exit, and CPU-hotplug code paths. In this respect, RCU-tasks trace is similar to SRCU, but with lighter-weight readers. 
o Avoids expensive read-side instruction, having overhead similar to that of Preemptible RCU. There are of course downsides: o The grace-period code can send IPIs to CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. This is mitigated by later commits. o It is necessary to scan the full tasklist, much as for Tasks RCU. o There is a single callback queue guarded by a single lock, again, much as for Tasks RCU. However, those early use cases that request multiple grace periods in quick succession are expected to do so from a single task, which makes the single lock almost irrelevant. If needed, multiple callback queues can be provided using any number of schemes. Perhaps most important, this variant of RCU does not affect the vanilla flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace readers can operate from idle, offline, and exception entry/exit in no way enables rcu_preempt and rcu_sched readers to do so. The memory ordering was outlined here: https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/ This effort benefited greatly from off-list discussions of BPF requirements with Alexei Starovoitov and Andrii Nakryiko. At least some of the on-list discussions are captured in the Link: tags below. In addition, KCSAN was quite helpful in finding some early bugs. Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/ Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/ Cc: Alexei Starovoitov Cc: Andrii Nakryiko [ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ] [ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ] [ paulmck: Fix locking issue reported by rcutorture. ] Signed-off-by: Paul E. McKenney --- kernel/fork.c | 4 + kernel/rcu/Kconfig | 11 +- kernel/rcu/tasks.h | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 371 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..72e9396235b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 6ee6372a4459..cb1d18ef343c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -93,6 +93,15 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. 
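(Editorial aside, not part of the patch: a minimal reader/updater sketch of the API this commit introduces. Of the names below, only rcu_read_lock_trace(), rcu_read_unlock_trace(), rcu_read_lock_trace_held(), call_rcu_tasks_trace() and synchronize_rcu_tasks_trace() come from this series; the header name, struct trace_cfg, and the helper functions are assumptions made purely for illustration.)

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>
	#include <linux/rcupdate_trace.h>	/* Assumed location of the _trace read-side markers. */

	/* Hypothetical configuration hung off a tracing hook. */
	struct trace_cfg {
		int verbose;
		struct rcu_head rh;
	};

	static struct trace_cfg __rcu *cur_cfg;

	/* Read side: legal from the idle loop, exception entry/exit, and CPU-hotplug paths. */
	static void my_trace_hook(void)
	{
		struct trace_cfg *cfg;

		rcu_read_lock_trace();			/* Explicit read-side marker. */
		cfg = rcu_dereference_check(cur_cfg, rcu_read_lock_trace_held());
		if (cfg && cfg->verbose)
			pr_info("tracing hook fired\n");
		rcu_read_unlock_trace();		/* Last waited-for reader wakes the GP kthread. */
	}

	static void free_cfg_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct trace_cfg, rh));
	}

	/* Update side: publish a new configuration, then retire the old one. */
	static void set_trace_cfg(struct trace_cfg *newcfg)
	{
		struct trace_cfg *old = rcu_dereference_protected(cur_cfg, 1);

		rcu_assign_pointer(cur_cfg, newcfg);
		if (old)
			call_rcu_tasks_trace(&old->rh, free_cfg_cb);
		/* Equivalently, for a blocking caller: synchronize_rcu_tasks_trace(); kfree(old); */
	}

As with call_rcu() versus synchronize_rcu(), the asynchronous form is preferred when the caller cannot block; the kerneldoc for call_rcu_tasks_trace() further down in this patch spells out the ordering guarantees.)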
+ config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d8b09d5d1db1..fd34fd673a8c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -181,12 +181,17 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) preempt_enable(); } +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { + struct task_struct *t = current; + preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); preempt_enable(); + exit_tasks_rcu_finish_trace(t); } #ifndef CONFIG_TINY_RCU @@ -196,15 +201,19 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) */ static void __init rcu_tasks_bootup_oddness(void) { -#ifdef CONFIG_TASKS_RCU +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU pr_info("\tRude variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -569,3 +578,347 @@ static int __init rcu_spawn_tasks_rude_kthread(void) core_initcall(rcu_spawn_tasks_rude_kthread); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. 
+DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + if (atomic_dec_and_test(&trc_n_readers_need_end)) + wake_up(&trc_wait); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + // If the task is in a read-side critical section, set up its + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + if (unlikely(t->trc_reader_nesting)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + } + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. 
+ if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Wait for CPU-hotplug paths to complete. + cpus_read_lock(); + cpus_read_unlock(); + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + t->trc_reader_checked = false; + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* Do intermediate processing between task and holdout scans. */ +static void rcu_tasks_trace_postscan(void) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool ndrpt, bool *frptp) +{ + struct task_struct *g, *t; + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(void) +{ + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + wait_event_idle_exclusive(trc_wait, + atomic_read(&trc_n_readers_need_end) == 0); + + smp_mb(); // Caller's code must be ordered after wakeup. +} + +/* Report any needed quiescent state for this exiting task. 
*/ +void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. 
;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3 From c1a76c0b6abac4e7eb49b5c24a0829f47b70769d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Mar 2020 10:32:30 -0700 Subject: rcutorture: Add torture tests for RCU Tasks Trace This commit adds the definitions required to torture the tracing flavor of RCU tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 ++ kernel/rcu/rcu.h | 1 + kernel/rcu/rcutorture.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index b5f3545b0cfb..452feae8de20 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -30,6 +30,7 @@ config RCU_PERF_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -48,6 +49,7 @@ config RCU_TORTURE_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5746202e124..72903867833e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -442,6 +442,7 @@ enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b0663801a82..0bec9254959b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "rcu.h" @@ -757,6 +758,45 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. 
*/ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1323,6 +1363,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -2440,7 +2481,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From 4593e772b5020e714e18f6e212d70b24fbe88b79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Mar 2020 12:13:53 -0700 Subject: rcu-tasks: Add stall warnings for RCU Tasks Trace This commit adds RCU CPU stall warnings for RCU Tasks Trace. These dump out any tasks blocking the current grace period, as well as any CPUs that have not responded to an IPI request. This happens in two phases, when initially extracting state from the tasks and later when waiting for any holdout tasks to check in. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd34fd673a8c..4237881e7780 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -798,9 +798,41 @@ static void rcu_tasks_trace_postscan(void) // Any tasks that exit after this point will set ->trc_reader_checked. } +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_need_end], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + /* Do one scan of the holdout list. */ static void check_all_holdout_tasks_trace(struct list_head *hop, - bool ndrpt, bool *frptp) + bool needreport, bool *firstreport) { struct task_struct *g, *t; @@ -813,21 +845,51 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, // If check succeeded, remove this task from the list. 
if (READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); } } /* Wait for grace period to complete and provide ordering. */ static void rcu_tasks_trace_postgp(void) { + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + // Remove the safety count. smp_mb__before_atomic(); // Order vs. earlier atomics atomic_dec(&trc_n_readers_need_end); smp_mb__after_atomic(); // Order vs. later atomics // Wait for readers. - wait_event_idle_exclusive(trc_wait, - atomic_read(&trc_n_readers_need_end) == 0); - + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_need_end)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_need_end)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } smp_mb(); // Caller's code must be ordered after wakeup. } -- cgit v1.2.3 From 8fd8ca388ccf233b8ae0b6b42ddc7caa5034ae85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Mar 2020 14:51:20 -0700 Subject: rcu-tasks: Move #ifdef into tasks.h This commit pushes the #ifdef CONFIG_TASKS_RCU_GENERIC from kernel/rcu/update.c to kernel/rcu/tasks.h in order to improve readability as more APIs are added. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++++ kernel/rcu/update.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4237881e7780..b7e3e1d28a5c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,6 +5,7 @@ * Copyright (C) 2020 Paul E. McKenney */ +#ifdef CONFIG_TASKS_RCU_GENERIC //////////////////////////////////////////////////////////////////////// // @@ -984,3 +985,7 @@ core_initcall(rcu_spawn_tasks_trace_kthread); #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 30dce20e1644..c5799349ff31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,11 +584,7 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ -#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" -#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -static inline void rcu_tasks_bootup_oddness(void) {} -#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From e21408ceec2de5be418efa39feb1e2c00f824a72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 11:01:55 -0700 Subject: rcu-tasks: Add RCU tasks to rcutorture writer stall output This commit adds state for each RCU-tasks flavor to the rcutorture writer stall output. 
The initial state is minimal, but you have to start somewhere. Signed-off-by: Paul E. McKenney [ paulmck: Fixes based on feedback from kbuild test robot. ] --- kernel/rcu/rcu.h | 1 + kernel/rcu/tasks.h | 45 +++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree_stall.h | 2 +- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 72903867833e..e1089fdf8626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b7e3e1d28a5c..e4f89425d2c4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -219,6 +219,16 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s %c%c %s\n", + rtp->kname, + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} + #ifdef CONFIG_TASKS_RCU //////////////////////////////////////////////////////////////////////// @@ -482,7 +492,14 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); -#endif /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -578,7 +595,14 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); -#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -982,10 +1006,27 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[32]; + + sprintf(buf, "N%d", atomic_read(&trc_n_readers_need_end)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c65c9759e038..e1c68a74574f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,7 +649,7 @@ void show_rcu_gp_kthreads(void) if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* 
sched_show_task(rcu_state.gp_kthread); */ + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From af051ca4e4231fcf5f366e28453ac28208bb36c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 12:13:33 -0700 Subject: rcu-tasks: Make rcutorture writer stall output include GP state This commit adds grace-period state and time to the rcutorture writer stall output. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e4f89425d2c4..c93fb29b460c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -17,7 +17,7 @@ typedef void (*pregp_func_t)(void); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); typedef void (*postscan_func_t)(void); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); -typedef void (*postgp_func_t)(void); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -27,6 +27,9 @@ typedef void (*postgp_func_t)(void); * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -41,6 +44,8 @@ struct rcu_tasks { struct rcu_head **cbs_tail; struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -73,10 +78,56 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; + //////////////////////////////////////////////////////////////////////// // // Generic code. +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +/* Return state name. 
*/ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) @@ -141,15 +192,18 @@ static int __noreturn rcu_tasks_kthread(void *arg) READ_ONCE(rtp->cbs_head)); if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); schedule_timeout_interruptible(HZ/10); } continue; } // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); rtp->gp_func(rtp); /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); while (list) { next = list->next; local_bh_disable(); @@ -160,6 +214,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) } /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } } @@ -222,8 +278,11 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s %c%c %s\n", + pr_info("%s: %s(%d) since %lu %c%c %s\n", rtp->kname, + tasks_gp_state_getname(rtp), + data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -243,6 +302,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) LIST_HEAD(holdouts); int fract; + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); /* @@ -251,11 +311,13 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * that are not already voluntarily blocked. Mark these tasks * and make a list of them in holdouts. */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); rcu_read_lock(); for_each_process_thread(g, t) rtp->pertask_func(t, &holdouts); rcu_read_unlock(); + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); rtp->postscan_func(); /* @@ -277,6 +339,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) break; /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); schedule_timeout_interruptible(HZ/fract); if (fract > 1) @@ -288,10 +351,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) lastreport = jiffies; firstreport = true; WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); } - rtp->postgp_func(); + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); } //////////////////////////////////////////////////////////////////////// @@ -394,7 +459,7 @@ static void check_all_holdout_tasks(struct list_head *hop, } /* Finish off the Tasks-RCU grace period. */ -static void rcu_tasks_postgp(void) +static void rcu_tasks_postgp(struct rcu_tasks *rtp) { /* * Because ->on_rq and ->nvcsw are not guaranteed to have a full @@ -881,7 +946,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, } /* Wait for grace period to complete and provide ordering. 
*/ -static void rcu_tasks_trace_postgp(void) +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { bool firstreport; struct task_struct *g, *t; @@ -894,6 +959,7 @@ static void rcu_tasks_trace_postgp(void) smp_mb__after_atomic(); // Order vs. later atomics // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); for (;;) { ret = wait_event_idle_exclusive_timeout( trc_wait, @@ -901,6 +967,7 @@ static void rcu_tasks_trace_postgp(void) READ_ONCE(rcu_task_stall_timeout)); if (ret) break; // Count reached zero. + // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) if (READ_ONCE(t->trc_reader_need_end)) trc_add_holdout(t, &holdouts); -- cgit v1.2.3 From 43766c3eadcf6033c92eb953f88801aebac0f785 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 20:38:29 -0700 Subject: rcu-tasks: Make RCU Tasks Trace make use of RCU scheduler hooks This commit makes the calls to rcu_tasks_qs() detect and report quiescent states for RCU tasks trace. If the task is in a quiescent state and if ->trc_reader_checked is not yet set, the task sets its own ->trc_reader_checked. This will cause the grace-period kthread to remove it from the holdout list if it still remains there. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++-- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c93fb29b460c..6f8a4040fbdd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,7 +180,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_unlock_lock(); // Order updates vs. GP. + smp_mb__after_spinlock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -874,7 +874,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { WRITE_ONCE(t->trc_reader_need_end, false); - t->trc_reader_checked = false; + WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -983,6 +983,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); } smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. } /* Report any needed quiescent state for this exiting task. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4f34c325dd90..37e02812d18f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,8 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -841,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } -- cgit v1.2.3 From 88092d0c99d7584d50cc8caadb8fa9ff8a1d4ea0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 17 Mar 2020 08:57:02 -0700 Subject: rcu-tasks: Add a grace-period start time for throttling and debug This commit adds a place to record the grace-period start in jiffies. This will be used by later commits for debugging purposes and to throttle IPIs early in the grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6f8a4040fbdd..71462cf3d4bd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,6 +46,7 @@ struct rcu_tasks { raw_spinlock_t cbs_lock; int gp_state; unsigned long gp_jiffies; + unsigned long gp_start; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -200,6 +201,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) // Wait for one grace period. set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; rtp->gp_func(rtp); /* Invoke the callbacks. */ -- cgit v1.2.3 From b0afa0f056676ffe0a7213818f09d2460adbcc16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 11:39:26 -0700 Subject: rcu-tasks: Provide boot parameter to delay IPIs until late in grace period This commit provides a rcupdate.rcu_task_ipi_delay kernel boot parameter that specifies how old the RCU tasks trace grace period must be before the grace-period kthread starts sending IPIs. This delay allows more tasks to pass through rcu_tasks_qs() quiescent states, thus reducing (or even eliminating) the number of IPIs that must be sent. On a short rcutorture test setting this kernel boot parameter to HZ/2 resulted in zero IPIs for all 877 RCU-tasks trace grace periods that elapsed during that test. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 71462cf3d4bd..eeac4a122234 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -74,6 +74,11 @@ static struct rcu_tasks rt_name = \ /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; @@ -713,6 +718,10 @@ DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { @@ -998,10 +1007,6 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) rcu_read_unlock_trace_special(t); } -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, - "RCU Tasks Trace"); - /** * call_rcu_tasks_trace() - Queue a callback trace task-based grace period * @rhp: structure to be used for queueing the RCU updates. 
-- cgit v1.2.3 From 276c410448dbca357a2bc3539acfe04862e5f172 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 16:02:06 -0700 Subject: rcu-tasks: Split ->trc_reader_need_end This commit splits ->trc_reader_need_end by using the rcu_special union. This change permits readers to check to see if a memory barrier is required without any added overhead in the common case where no such barrier is required. This commit also adds the read-side checking. Later commits will add the machinery to properly set the new ->trc_reader_special.b.need_mb field. This commit also makes rcu_read_unlock_trace_special() tolerate nested read-side critical sections within interrupt and NMI handlers. Signed-off-by: Paul E. McKenney --- kernel/fork.c | 1 + kernel/rcu/tasks.h | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 72e9396235b4..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1685,6 +1685,7 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eeac4a122234..17b1b9a31071 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -723,10 +723,17 @@ DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t) +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - WRITE_ONCE(t->trc_reader_need_end, false); - if (atomic_dec_and_test(&trc_n_readers_need_end)) + int nq = t->trc_reader_special.b.need_qs; + + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) wake_up(&trc_wait); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -777,8 +784,8 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -804,8 +811,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // exit from that critical section. if (unlikely(t->trc_reader_nesting)) { atomic_inc(&trc_n_readers_need_end); // One more to wait on. 
- WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); } return true; } @@ -884,7 +891,7 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { - WRITE_ONCE(t->trc_reader_need_end, false); + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); @@ -916,7 +923,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], t->trc_reader_nesting, - " N"[!!t->trc_reader_need_end], + " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); } @@ -980,11 +987,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) break; // Count reached zero. // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_need_end)) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_need_end)) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); } @@ -1003,8 +1010,8 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) - rcu_read_unlock_trace_special(t); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); } /** -- cgit v1.2.3 From 238dbce39ea467577ce7e41ee3e98748c436ed0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 10:54:05 -0700 Subject: rcu-tasks: Add grace-period and IPI counts to statistics This commit adds a grace-period count and a count of IPIs sent since boot, which is printed in response to rcutorture writer stalls and at the end of rcutorture testing. These counts will be used to evaluate various schemes to reduce the number of IPIs sent. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 17b1b9a31071..4857450c0430 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -30,6 +30,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @gp_state: Grace period's most recent state transition (debugging). * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -47,6 +49,8 @@ struct rcu_tasks { int gp_state; unsigned long gp_jiffies; unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -208,6 +212,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_GP); rtp->gp_start = jiffies; rtp->gp_func(rtp); + rtp->n_gps++; /* Invoke the callbacks. 
*/ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); @@ -285,11 +290,12 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s: %s(%d) since %lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -592,6 +598,7 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } @@ -856,6 +863,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, atomic_inc(&trc_n_readers_need_end); per_cpu(trc_ipi_to_cpu, cpu) = true; t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) { // Just in case there is some other reason for -- cgit v1.2.3 From 9ae58d7bd11f1fc4c96389df11751f8593d8bd33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 17:16:37 -0700 Subject: rcu-tasks: Add Kconfig option to mediate smp_mb() vs. IPI This commit provides a new TASKS_TRACE_RCU_READ_MB Kconfig option that enables use of read-side memory barriers by both rcu_read_lock_trace() and rcu_read_unlock_trace() when the are executed with the current->trc_reader_special.b.need_mb flag set. This flag is currently never set. Doing that is the subject of a later commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 18 ++++++++++++++++++ kernel/rcu/tasks.h | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cb1d18ef343c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -234,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4857450c0430..4147857007d7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -734,7 +734,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { int nq = t->trc_reader_special.b.need_qs; - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
if (nq) -- cgit v1.2.3 From 7d0c9c50c5a109acd7a5cf589fc5563f9ef7149a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Mar 2020 15:33:12 -0700 Subject: rcu-tasks: Avoid IPIing userspace/idle tasks if kernel is so built Systems running CPU-bound real-time task do not want IPIs sent to CPUs executing nohz_full userspace tasks. Battery-powered systems don't want IPIs sent to idle CPUs in low-power mode. Unfortunately, RCU tasks trace can and will send such IPIs in some cases. Both of these situations occur only when the target CPU is in RCU dyntick-idle mode, in other words, when RCU is not watching the target CPU. This suggests that CPUs in dyntick-idle mode should use memory barriers in outermost invocations of rcu_read_lock_trace() and rcu_read_unlock_trace(), which would allow the RCU tasks trace grace period to directly read out the target CPU's read-side state. One challenge is that RCU tasks trace is not targeting a specific CPU, but rather a task. And that task could switch from one CPU to another at any time. This commit therefore uses try_invoke_on_locked_down_task() and checks for task_curr() in trc_inspect_reader_notrunning(). When this condition holds, the target task is running and cannot move. If CONFIG_TASKS_TRACE_RCU_READ_MB=y, the new rcu_dynticks_zero_in_eqs() function can be used to check if the specified integer (in this case, t->trc_reader_nesting) is zero while the target CPU remains in that same dyntick-idle sojourn. If so, the target task is in a quiescent state. If not, trc_read_check_handler() must indicate failure so that the grace-period kthread can take appropriate action or retry after an appropriate delay, as the case may be. With this change, given CONFIG_TASKS_TRACE_RCU_READ_MB=y, if a given CPU remains idle or a given task continues executing in nohz_full mode, the RCU tasks trace grace-period kthread will detect this without the need to send an IPI. Suggested-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/tasks.h | 36 ++++++++++++++++++++++++++---------- kernel/rcu/tree.c | 24 ++++++++++++++++++++++++ kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 18 ++++++++++++++++++ 5 files changed, 72 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e1089fdf8626..296f9262d119 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -501,6 +501,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -510,6 +511,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4147857007d7..a9e8ecb10860 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -806,22 +806,38 @@ reset_ipi: /* Callback function for scheduler to check locked-down task. 
*/ static bool trc_inspect_reader(struct task_struct *t, void *arg) { - if (task_curr(t)) - return false; // It is running, so decline to inspect it. + int cpu = task_cpu(t); + bool in_qs = false; + + if (task_curr(t)) { + // If no chance of heavyweight readers, do it the hard way. + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state despite its currently running. + // However, we cannot safely change its state. + if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } // Mark as checked. Because this is called from the grace-period // kthread, also remove the task from the holdout list. t->trc_reader_checked = true; trc_del_holdout(t); - // If the task is in a read-side critical section, set up its - // its state so that it will awaken the grace-period kthread upon - // exit from that critical section. - if (unlikely(t->trc_reader_nesting)) { - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); - } + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bbcbf398169..573fd78a7bca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -252,6 +252,7 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -278,6 +279,7 @@ static void rcu_dynticks_eqs_exit(void) */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { @@ -349,6 +351,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) return snap != rcu_dynticks_snap(rdp); } +/* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! 
+ return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + /* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..29ba79989802 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -454,6 +454,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 37e02812d18f..4cef7e3bca69 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2552,3 +2552,21 @@ static void rcu_dynticks_task_exit(void) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} -- cgit v1.2.3 From b38f57c1fe64276773b124dffb0a139cc32ab3cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Mar 2020 14:29:08 -0700 Subject: rcu-tasks: Allow rcu_read_unlock_trace() under scheduler locks The rcu_read_unlock_trace() can invoke rcu_read_unlock_trace_special(), which in turn can call wake_up(). Therefore, if any scheduler lock is held across a call to rcu_read_unlock_trace(), self-deadlock can occur. This commit therefore uses the irq_work facility to defer the wake_up() to a clean environment where no scheduler locks will be held. Reported-by: Steven Rostedt [ paulmck: Update #includes for m68k per kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 12 +++++++++++- kernel/rcu/update.c | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a9e8ecb10860..dd311e93ed0f 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -729,6 +729,16 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + /* If we are the last reader, wake up the grace-period kthread. 
*/ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { @@ -742,7 +752,7 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_nesting, nesting); if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) - wake_up(&trc_wait); + irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..b1f07a0e3a56 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS -- cgit v1.2.3 From 81b4a7bc3b54b0b839dbf3d2b8c9a353ae910688 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 10:10:07 -0700 Subject: rcu-tasks: Disable CPU hotplug across RCU tasks trace scans This commit disables CPU hotplug across RCU tasks trace scans, which is a first step towards correctly recognizing idle tasks "running" on offline CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index dd311e93ed0f..361e17d57191 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -910,16 +910,16 @@ static void rcu_tasks_trace_pregp_step(void) { int cpu; - // Wait for CPU-hotplug paths to complete. - cpus_read_lock(); - cpus_read_unlock(); - // Allow for fast-acting IPIs. atomic_set(&trc_n_readers_need_end, 1); // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); } /* Do first-round processing for the specified task. */ @@ -935,6 +935,9 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, /* Do intermediate processing between task and holdout scans. */ static void rcu_tasks_trace_postscan(void) { + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); @@ -979,6 +982,9 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, { struct task_struct *g, *t; + // Disable CPU hotplug across the holdout list scan. + cpus_read_lock(); + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { // If safe and needed, try to check the current task. if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && @@ -991,6 +997,10 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, else if (needreport) show_stalled_task_trace(t, firstreport); } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + if (needreport) { if (firstreport) pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); -- cgit v1.2.3 From 7e3b70e0703b48e120c3f5e65498790341120fad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 11:24:58 -0700 Subject: rcu-tasks: Handle the running-offline idle-task special case The idle task corresponding to an offline CPU can appear to be running while that CPU is offline. This commit therefore adds checks for this situation, treating it as a quiescent state. Because the tasklist scan and the holdout-list scan now exclude CPU-hotplug operations, readers on the CPU-hotplug paths are still waited for. 
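To spell out the special case, here is a minimal sketch of the condition being handled; it is an illustration built on the usual scheduler helpers, not code from the patch below, and the helper name is hypothetical.

/*
 * Illustrative sketch (hypothetical helper, not from the patch below):
 * task_curr() only reports whether the task is its runqueue's current
 * task, and an offline CPU's runqueue still points at its idle task,
 * so that idle task "appears to be running".  The scan therefore also
 * consults cpu_is_offline() and is_idle_task() before treating such a
 * task as being in a quiescent state.
 */
static bool idle_task_on_offline_cpu(struct task_struct *t)
{
	int cpu = task_cpu(t);

	return task_curr(t) && cpu_is_offline(cpu) && is_idle_task(t);
}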
Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 361e17d57191..e3a42d8f9eeb 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -818,16 +818,20 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) { int cpu = task_cpu(t); bool in_qs = false; + bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { + WARN_ON_ONCE(ofl & !is_idle_task(t)); + // If no chance of heavyweight readers, do it the hard way. - if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) return false; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. in_qs = true; } else { -- cgit v1.2.3 From 9796e1ae7386ecf66eb234f7db7753845ebb2139 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 13:18:54 -0700 Subject: rcu-tasks: Make RCU tasks trace also wait for idle tasks This commit scans the CPUs, adding each CPU's idle task to the list of tasks that need quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e3a42d8f9eeb..f272e8f16b81 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -15,7 +15,7 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); typedef void (*pregp_func_t)(void); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); -typedef void (*postscan_func_t)(void); +typedef void (*postscan_func_t)(struct list_head *hop); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); @@ -331,7 +331,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rcu_read_unlock(); set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); - rtp->postscan_func(); + rtp->postscan_func(&holdouts); /* * Each pass through the following loop scans the list of holdout @@ -415,7 +415,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(void) +void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. This @@ -936,9 +936,17 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, trc_wait_for_one_reader(t, hop); } -/* Do intermediate processing between task and holdout scans. */ -static void rcu_tasks_trace_postscan(void) +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) { + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + // Re-enable CPU hotplug now that the tasklist scan has completed. cpus_read_unlock(); -- cgit v1.2.3 From 40471509be3cb8c9c02aec1c316614cb96e6fe85 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 22 Mar 2020 13:34:34 -0700 Subject: rcu-tasks: Add rcu_dynticks_zero_in_eqs() effectiveness statistics This commit adds counts of the number of calls and number of successful calls to rcu_dynticks_zero_in_eqs(), which are printed at the end of rcutorture runs and at stall time. This allows evaluation of the effectiveness of rcu_dynticks_zero_in_eqs(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f272e8f16b81..ce658831c759 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -725,6 +725,11 @@ DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; + void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); @@ -830,9 +835,11 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. + n_heavy_reader_attempts++; if (!ofl && // Check for "running" idle tasks on offline CPUs. !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; in_qs = true; } else { in_qs = likely(!t->trc_reader_nesting); @@ -1147,9 +1154,11 @@ core_initcall(rcu_spawn_tasks_trace_kthread); static void show_rcu_tasks_trace_gp_kthread(void) { - char buf[32]; + char buf[64]; - sprintf(buf, "N%d", atomic_read(&trc_n_readers_need_end)); + sprintf(buf, "N%d h:%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } -- cgit v1.2.3 From edf3775f0ad66879796f594983163f672c4bf1a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 14:09:45 -0700 Subject: rcu-tasks: Add count for idle tasks on offline CPUs This commit adds a counter for the number of times the quiescent state was an idle task associated with an offline CPU, and prints this count at the end of rcutorture runs and at stall time. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce658831c759..0d1b5bf8317d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -729,6 +729,7 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); // heavyweight readers executing explicit memory barriers. unsigned long n_heavy_reader_attempts; unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -840,6 +841,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. 
n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; in_qs = true; } else { in_qs = likely(!t->trc_reader_nesting); @@ -1156,7 +1159,8 @@ static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%d h:%lu/%lu", atomic_read(&trc_n_readers_need_end), + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); -- cgit v1.2.3 From 7e0669c3e9dec367ecb63062898c70c1c596b749 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Mar 2020 14:36:05 -0700 Subject: rcu-tasks: Add IPI failure count to statistics This commit adds a failure-return count for smp_call_function_single(), and adds this to the console messages for rcutorture writer stalls and at the end of rcutorture testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 0d1b5bf8317d..0a580eff7c34 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @gp_start: Most recent grace-period start in jiffies. * @n_gps: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -51,6 +52,7 @@ struct rcu_tasks { unsigned long gp_start; unsigned long n_gps; unsigned long n_ipis; + unsigned long n_ipis_fails; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -290,12 +292,12 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s: %s(%d) since %lu g:%lu i:%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, - tasks_gp_state_getname(rtp), - data_race(rtp->gp_state), + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), - data_race(rtp->n_gps), data_race(rtp->n_ipis), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -909,6 +911,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, trc_read_check_handler, t, 0)) { // Just in case there is some other reason for // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; per_cpu(trc_ipi_to_cpu, cpu) = false; t->trc_ipi_to_cpu = cpu; if (atomic_dec_and_test(&trc_n_readers_need_end)) { -- cgit v1.2.3 From 25246fc83155b254534ce579fb713828fb5e621a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 5 Apr 2020 20:49:13 -0700 Subject: rcu-tasks: Allow standalone use of TASKS_{TRACE_,}RCU This commit allows TASKS_TRACE_RCU to be used independently of TASKS_RCU and vice versa. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 0a580eff7c34..ce23f6cc5043 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -244,27 +244,6 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) smp_mb(); /* Ensure others see full kthread. */ } -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - struct task_struct *t = current; - - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); - preempt_enable(); - exit_tasks_rcu_finish_trace(t); -} - #ifndef CONFIG_TINY_RCU /* @@ -303,7 +282,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) s); } -#ifdef CONFIG_TASKS_RCU +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) //////////////////////////////////////////////////////////////////////// // @@ -374,6 +355,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtp->postgp_func(rtp); } +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + //////////////////////////////////////////////////////////////////////// // // Simple variant of RCU whose quiescent states are voluntary context @@ -577,8 +562,29 @@ static void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + #else /* #ifdef CONFIG_TASKS_RCU */ static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1075,7 +1081,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) } /* Report any needed quiescent state for this exiting task. 
*/ -void exit_tasks_rcu_finish_trace(struct task_struct *t) +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); @@ -1170,7 +1176,7 @@ static void show_rcu_tasks_trace_gp_kthread(void) } #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3 From e5a971d76d701dbff9e5dbaa84dc9e8c3081a867 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Apr 2020 12:10:28 -0700 Subject: ftrace: Use synchronize_rcu_tasks_rude() instead of ftrace_sync() This commit replaces the schedule_on_each_cpu(ftrace_sync) instances with synchronize_rcu_tasks_rude(). Suggested-by: Steven Rostedt Cc: Ingo Molnar [ paulmck: Make Kconfig adjustments noted by kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/trace/Kconfig | 1 + kernel/trace/ftrace.c | 17 +++-------------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 402eef84c859..ae69010d521a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -158,6 +158,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 041694a1eb74..771eace959f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -256,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2932,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -5887,7 +5876,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } -- cgit v1.2.3 From 654db05cee8186cf9438d94ef32a4f9ffe964e57 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 9 Feb 2020 02:35:22 -0800 Subject: rcu: Use data_race() for RCU expedited CPU stall-warning prints Although the accesses used to determine whether or not an expedited stall should be printed are an integral part of the concurrency algorithm governing use of the corresponding variables, the values that are simply printed are ancillary. As such, it is best to use data_race() for these accesses in order to provide the greatest latitude in the use of KCSAN for the other accesses that are an integral part of the algorithm. This commit therefore changes the relevant uses of READ_ONCE() to data_race(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..e1a7986f15b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } -- cgit v1.2.3 From 88375825171c7de5f1e68ac6fd5d35d3b831da3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Mar 2020 19:00:52 -0700 Subject: rcu: When GP kthread is starved, tag idle threads as false positives If the grace-period kthread is starved, idle threads' extended quiescent states are not reported. These idle threads thus wrongly appear to be blocking the current grace period. This commit therefore tags such idle threads as probable false positives when the grace-period kthread is being starved. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4da41a613ece 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -299,6 +299,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. 
* @@ -313,6 +323,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +344,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +358,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,8 +369,7 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), @@ -364,6 +377,7 @@ static void rcu_check_gp_kthread_starvation(void) gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); -- cgit v1.2.3 From 33b2b93bd831fc0e994654cef3d046c713e3b55e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Apr 2020 14:12:07 -0700 Subject: rcu: Remove self-stack-trace when all quiescent states seen When all quiescent states have been seen, it is normally the grace-period kthread that is in trouble. Although the existing stack trace from the current CPU might possibly provide useful information, experience indicates that there is too much noise for this to be worthwhile. This commit therefore removes this stack trace from the output. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4da41a613ece..535762b07543 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -440,8 +440,6 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_state.name, j - gpa, j, gpa, READ_ONCE(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ -- cgit v1.2.3 From 3b2a47398552938d2ae0091f35eb3658a52a0769 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:30:35 -0700 Subject: rcutorture: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..7e2ea0c57433 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -- cgit v1.2.3 From c9527bebb017b891d1a2bbb96217bd5225488a0e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Feb 2020 13:41:02 -0800 Subject: rcutorture: Mark data-race potential for rcu_barrier() test statistics The n_barrier_successes, n_barrier_attempts, and n_rcu_torture_barrier_error variables are updated (without access markings) by the main rcu_barrier() test kthread, and accessed (also without access markings) by the rcu_torture_stats() kthread. This of course can result in KCSAN complaints. Because the accesses are in diagnostic prints, this commit uses data_race() to excuse the diagnostic prints from the data race. If this were to ever cause bogus statistics prints (for example, due to store tearing), any misleading information would be disambiguated by the presence or absence of an rcutorture splat. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to the mild consequences of the failure, namely a confusing rcutorture console message. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e2ea0c57433..d0345d14e22a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1456,9 +1456,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || -- cgit v1.2.3 From 52785b6ae8eded7ac99d65c92d989b702e5b4376 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Apr 2020 02:58:37 +0000 Subject: kcsan: Use GFP_ATOMIC under spin lock A spin lock is held in insert_report_filterlist(), so the krealloc() should use GFP_ATOMIC. This commit therefore makes this change. Reviewed-by: Marco Elver Signed-off-by: Wei Yongjun Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 1a08664a7fab..023e49c58d55 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -230,7 +230,7 @@ static ssize_t insert_report_filterlist(const char *func) /* initial allocation */ report_filterlist.addrs = kmalloc_array(report_filterlist.size, - sizeof(unsigned long), GFP_KERNEL); + sizeof(unsigned long), GFP_ATOMIC); if (report_filterlist.addrs == NULL) { ret = -ENOMEM; goto out; @@ -240,7 +240,7 @@ static ssize_t insert_report_filterlist(const char *func) size_t new_size = report_filterlist.size * 2; unsigned long *new_addrs = krealloc(report_filterlist.addrs, - new_size * sizeof(unsigned long), GFP_KERNEL); + new_size * sizeof(unsigned long), GFP_ATOMIC); if (new_addrs == NULL) { /* leave filterlist itself untouched */ -- cgit v1.2.3 From 8c1b2bf16d5944cd5c3a8a72e24ed9e22360c1af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:34 +0200 Subject: bpf, cgroup: Remove unused exports Except for a few of the networking hooks called from modular ipv4 or ipv6 code, all of hooks are just called from guaranteed to be built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20200424064338.538313-2-hch@lst.de --- kernel/bpf/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4d748c5785bc..fc7c7002fd37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1054,7 +1054,6 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return !allow; } -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1207,7 +1206,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, return ret == 1 ? 0 : -EPERM; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, @@ -1312,7 +1310,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -1399,7 +1396,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, -- cgit v1.2.3 From 182e073f68a080a29920f6dca796ccf4806b0329 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 00:40:04 +0800 Subject: cpu/hotplug: Fix a typo in comment "broadacasted"->"broadcasted" Signed-off-by: Ethon Paul Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200417164008.6541-1-ethp@qq.com --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..6a02d4434cf7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -432,7 +432,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. 
*/ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); -- cgit v1.2.3 From 9d2161bed4e39ef7a5e5f18f69c4a57d001051b9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:37:04 -0400 Subject: audit: log audit netlink multicast bind and unbind Log information about programs connecting to and disconnecting from the audit netlink multicast socket. This is needed so that during investigations a security officer can tell who or what had access to the audit trail. This helps to meet the FAU_SAR.2 requirement for Common Criteria. Here is the systemd startup event: type=PROCTITLE msg=audit(2020-04-22 10:10:21.787:10) : proctitle=/init type=SYSCALL msg=audit(2020-04-22 10:10:21.787:10) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x19 a1=0x555f4aac7e90 a2=0xc a3=0x7ffcb792ff44 items=0 ppid=0 pid=1 auid=unset uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=(none) ses=unset comm=systemd exe=/usr/lib/systemd/systemd subj=kernel key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 10:10:21.787:10) : pid=1 uid=root auid=unset tty=(none) ses=unset subj=kernel comm=systemd exe=/usr/lib/systemd/systemd nl-mcgrp=1 op=connect res=yes And events from the test suite that just uses close(): type=PROCTITLE msg=audit(2020-04-22 11:47:08.501:442) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:47:08.501:442) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x563004378760 a2=0xc a3=0x0 items=0 ppid=815 pid=818 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:442) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:443) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes And the events from the test suite using setsockopt with NETLINK_DROP_MEMBERSHIP: type=PROCTITLE msg=audit(2020-04-22 11:39:53.291:439) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.291:439) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x5560877c2d20 a2=0xc a3=0x0 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.291:439) : pid=775 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=PROCTITLE msg=audit(2020-04-22 11:39:53.292:440) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.292:440) : arch=x86_64 syscall=setsockopt success=yes exit=0 a0=0x7 a1=SOL_NETLINK a2=0x2 a3=0x7ffc8366f000 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.292:440) : pid=775 uid=root auid=root tty=ttyS0 ses=1 
subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes Please see the upstream issue tracker at https://github.com/linux-audit/audit-kernel/issues/28 With the feature description at https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Multicast-Socket-Join-Part The testsuite support is at https://github.com/rgbriggs/audit-testsuite/compare/ghak28-mcast-part-join https://github.com/linux-audit/audit-testsuite/pull/93 And the userspace support patch is at https://github.com/linux-audit/audit-userspace/pull/114 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 622c30246d19..e33460e01b3b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1530,20 +1530,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; -- cgit v1.2.3 From 6b03d1304a32dc8450c7516000a0fe07bef7c446 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 19 Apr 2020 06:35:02 -0500 Subject: proc: Ensure we see the exit of each process tid exactly once When the thread group leader changes during exec and the old leaders thread is reaped proc_flush_pid will flush the dentries for the entire process because the leader still has it's original pid. Fix this by exchanging the pids in an rcu safe manner, and wrapping the code to do that up in a helper exchange_tids. When I removed switch_exec_pids and introduced this behavior in d73d65293e3e ("[PATCH] pidhash: kill switch_exec_pids") there really was nothing that cared as flushing happened with the cached dentry and de_thread flushed both of them on exec. 
This lack of fully exchanging pids became a problem a few months later when I introduced 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). Which overlooked the de_thread case was no longer swapping pids, and I was looking up proc dentries by task->pid. The current behavior isn't properly a bug as everything in proc will continue to work correctly just a little bit less efficiently. Fix this just so there are no little surprise corner cases waiting to bite people. -- Oleg points out this could be an issue in next_tgid in proc where has_group_leader_pid is called, and reording some of the assignments should fix that. -- Oleg points out this will break the 10 year old hack in __exit_signal.c > /* > * This can only happen if the caller is de_thread(). > * FIXME: this is the temporary hack, we should teach > * posix-cpu-timers to handle this case correctly. > */ > if (unlikely(has_group_leader_pid(tsk))) > posix_cpu_timers_exit_group(tsk); The code in next_tgid has been changed to use PIDTYPE_TGID, and the posix cpu timers code has been fixed so it does not need the 10 year old hack, so this should be safe to merge now. Link: https://lore.kernel.org/lkml/87h7x3ajll.fsf_-_@x220.int.ebiederm.org/ Acked-by: Linus Torvalds Acked-by: Oleg Nesterov Fixes: 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). Signed-off-by: Eric W. Biederman --- kernel/pid.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index c835b844aca7..6d5d0a5bda82 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -363,6 +363,25 @@ void change_pid(struct task_struct *task, enum pid_type type, attach_pid(task, type); } +void exchange_tids(struct task_struct *left, struct task_struct *right) +{ + struct pid *pid1 = left->thread_pid; + struct pid *pid2 = right->thread_pid; + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + + /* Swap the single entry tid lists */ + hlists_swap_heads_rcu(head1, head2); + + /* Swap the per task_struct pid */ + rcu_assign_pointer(left->thread_pid, pid2); + rcu_assign_pointer(right->thread_pid, pid1); + + /* Swap the cached value */ + WRITE_ONCE(left->pid, pid_nr(pid2)); + WRITE_ONCE(right->pid, pid_nr(pid1)); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) -- cgit v1.2.3 From 8feebc6713cd78cbba8c4e2a6d92299669052026 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Apr 2020 14:32:54 -0500 Subject: posix-cpu-timer: Tidy up group_leader logic in lookup_task Replace has_group_leader_pid with thread_group_leader. Years ago Oleg suggested changing thread_group_leader to has_group_leader_pid to handle races. Looking at the code then and now I don't see how it ever helped. Especially as then the code really did need to be the thread_group_leader. Today it doesn't make a difference if thread_group_leader races with de_thread as the task returned from lookup_task in the non-thread case is just used to find values in task->signal. Since the races with de_thread have never been handled revert has_group_header_pid to thread_group_leader for clarity. Update the comment in lookup_task to remove implementation details that are no longer true and to mention task->signal instead of task->sighand, as the relevant cpu timer details are all in task->signal. 
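For illustration (not part of this patch), the clock_gettime(PROCESS) path
discussed above can be exercised from user space with a minimal sketch like
the one below. It uses the POSIX clock_getcpuclockid() helper rather than
hand-encoding the clockid, and the target pid is arbitrary:

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Link with -lrt on older glibc. */
    int main(void)
    {
            pid_t target = getpid();        /* any process visible to the caller */
            clockid_t clk;
            struct timespec ts;

            /* Resolve the per-process CPU-time clock for target. */
            if (clock_getcpuclockid(target, &clk) != 0)
                    return 1;

            /* This should end up in posix_cpu_clock_get() -> lookup_task(). */
            if (clock_gettime(clk, &ts) != 0)
                    return 1;

            printf("process CPU time: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }
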
Ref: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Ref: c0deae8c9587 ("posix-cpu-timers: Rcu_read_lock/unlock protect find_task_by_vpid call") Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2fd3b3fa68bf..e4051e417bcb 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -69,12 +69,8 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, if (gettime) { /* * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. tsk->sighand gives + * the actual group leader. task->signal gives * access to the group's clock. - * - * Timers need the group leader because they take a - * reference on it and store the task pointer until the - * timer is destroyed. */ return (p == current || thread_group_leader(p)) ? p : NULL; } @@ -82,7 +78,7 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, /* * For processes require that p is group leader. */ - return has_group_leader_pid(p) ? p : NULL; + return thread_group_leader(p) ? p : NULL; } static struct task_struct *__get_task_for_clock(const clockid_t clock, -- cgit v1.2.3 From c7f5194054e103d18d267385b866b5b90511a425 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 Apr 2020 13:00:39 -0500 Subject: posix-cpu-timer: Unify the now redundant code in lookup_task Now that both !thread paths through lookup_task call thread_group_leader, unify them into the single test at the end of lookup_task. This unification just makes it clear what is happening in the gettime special case of lookup_task. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e4051e417bcb..b7f972fb115e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -66,14 +66,13 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, if (thread) return same_thread_group(p, current) ? p : NULL; - if (gettime) { - /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. task->signal gives - * access to the group's clock. - */ - return (p == current || thread_group_leader(p)) ? p : NULL; - } + /* + * For clock_gettime(PROCESS) the task does not need to be + * the actual group leader. task->signal gives + * access to the group's clock. + */ + if (gettime && (p == current)) + return p; /* * For processes require that p is group leader. -- cgit v1.2.3 From c4dad0aab3fca0c1f0baa4cc84b6ec91b7ebf426 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:28 -0400 Subject: audit: tidy and extend netfilter_cfg x_tables NETFILTER_CFG record generation was inconsistent for x_tables and ebtables configuration changes. The call was needlessly messy and there were supporting records missing at times while they were produced when not requested. Simplify the logging call into a new audit_log_nfcfg call. Honour the audit_enabled setting while more consistently recording information including supporting records by tidying up dummy checks. Add an op= field that indicates the operation being performed (register or replace). 
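For illustration only, a table-registration path on the net/ side (not shown
in this hunk) would invoke the new helper along these lines; the function and
variable names here are simplified assumptions, not the actual x_tables code:

    #include <linux/audit.h>

    /* Hypothetical, simplified caller -- the real net/ changes differ. */
    static void example_log_table_register(const char *table_name, u8 family,
                                           unsigned int nentries)
    {
            /*
             * Emits: type=NETFILTER_CFG ... table=<name> family=<af>
             *        entries=<n> op=register
             */
            __audit_log_nfcfg(table_name, family, nentries, AUDIT_XT_OP_REGISTER);
    }
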
Here is the enhanced sample record: type=NETFILTER_CFG msg=audit(1580905834.919:82970): table=filter family=2 entries=83 op=replace Generate audit NETFILTER_CFG records on ebtables table registration. Previously this was being done for x_tables registration and replacement operations and ebtables table replacement only. See: https://github.com/linux-audit/audit-kernel/issues/25 See: https://github.com/linux-audit/audit-kernel/issues/35 See: https://github.com/linux-audit/audit-kernel/issues/43 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..705beac0ce29 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -130,6 +130,16 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -2542,6 +2552,20 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op) +{ + struct audit_buffer *ab; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + name, af, nentries, audit_nfcfgs[op].s); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; -- cgit v1.2.3 From a45d88530b2552ad5ea0da18861600b4ecc9d0c7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:29 -0400 Subject: netfilter: add audit table unregister actions Audit the action of unregistering ebtables and x_tables. See: https://github.com/linux-audit/audit-kernel/issues/44 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 705beac0ce29..d281c18d1771 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -136,8 +136,9 @@ struct audit_nfcfgop_tab { }; const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_UNREGISTER, "unregister" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) -- cgit v1.2.3 From f9d041271cf44ca02eed0cc82e1a6d8c814c53ed Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:05 -0700 Subject: bpf: Refactor bpf_link update handling Make bpf_link update support more generic by making it into another bpf_link_ops methods. This allows generic syscall handling code to be agnostic to various conditionally compiled features (e.g., the case of CONFIG_CGROUP_BPF). This also allows to keep link type-specific code to remain static within respective code base. Refactor existing bpf_cgroup_link code and take advantage of this. 
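As a rough sketch of the pattern this enables (a hypothetical link type, not
code from this patch), any link flavour can opt into BPF_LINK_UPDATE simply by
filling in the new callback; link types that leave it NULL keep being rejected
by link_update():

    /* Hypothetical link type -- illustrative only. */
    static int example_link_update_prog(struct bpf_link *link,
                                        struct bpf_prog *new_prog,
                                        struct bpf_prog *old_prog)
    {
            /* If user space passed BPF_F_REPLACE, only swap when old_prog matches. */
            if (old_prog && link->prog != old_prog)
                    return -EPERM;

            /* ... detach link->prog from the hook, attach new_prog ... */
            return 0;
    }

    static const struct bpf_link_ops example_link_ops = {
            .release     = example_link_release,    /* assumed to exist elsewhere */
            .dealloc     = example_link_dealloc,    /* assumed to exist elsewhere */
            .update_prog = example_link_update_prog,
    };
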
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-2-andriin@fb.com --- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 11 ++++------- kernel/cgroup/cgroup.c | 27 --------------------------- 3 files changed, 32 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf634959885c..da6e48e802b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -557,8 +557,9 @@ found: * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -811,6 +836,7 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .update_prog = cgroup_bpf_replace, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..f5358e1462eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3645,13 +3645,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = EINVAL; out_put_progs: if (old_prog) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..557a9b9d2244 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog 
*prog, enum bpf_attach_type type) { -- cgit v1.2.3 From a3b80e1078943dc12553166fb08e258463dec013 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:06 -0700 Subject: bpf: Allocate ID for bpf_link Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog. bpf_link creation, initialization, attachment, and exposing to user-space through FD and ID is a complicated multi-step process, abstract it away through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and bpf_link_cleanup() internal API. They guarantee that until bpf_link is properly attached, user-space won't be able to access partially-initialized bpf_link either from FD or ID. All this allows to simplify bpf_link attachment and error handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-3-andriin@fb.com --- kernel/bpf/cgroup.c | 14 +++-- kernel/bpf/syscall.c | 143 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 104 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da6e48e802b2..1bdf37fca879 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = { int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5358e1462eb..5439e05e3d25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -2181,25 +2183,38 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. 
+ * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2210,6 +2225,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2275,9 +2291,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", link_type, + link->id, prog_tag, prog->aux->id); } @@ -2292,36 +2310,76 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. 
*/ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + fput(file); + return id; + } + + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2367,9 +2425,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2404,22 +2462,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2447,7 +2502,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, }; @@ -2456,13 +2511,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = { static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2515,24 +2570,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); 
out_put_btp: bpf_put_raw_tracepoint(btp); @@ -3464,7 +3517,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; -- cgit v1.2.3 From 2d602c8cf40d65d4a7ac34fe18648d8778e6e594 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:07 -0700 Subject: bpf: Support GET_FD_BY_ID and GET_NEXT_ID for bpf_link Add support to look up bpf_link by ID and iterate over all existing bpf_links in the system. GET_FD_BY_ID code handles not-yet-ready bpf_link by checking that its ID hasn't been set to non-zero value yet. Setting bpf_link's ID is done as the very last step in finalizing bpf_link, together with installing FD. This approach allows users of bpf_link in kernel code to not worry about races between user-space and kernel code that hasn't finished attaching and initializing bpf_link. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-4-andriin@fb.com --- kernel/bpf/syscall.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5439e05e3d25..1c213a730502 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3713,6 +3713,48 @@ out_put_link: return ret; } +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3830,6 +3872,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. 
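Taken together with the ID patches above, user space can now enumerate and
inspect links roughly as follows; this sketch assumes the matching low-level
libbpf helpers (bpf_link_get_next_id(), bpf_link_get_fd_by_id(),
bpf_obj_get_info_by_fd()) from the companion user-space changes:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    int main(void)
    {
            __u32 id = 0;

            /* BPF_LINK_GET_NEXT_ID: walk every bpf_link in the system. */
            while (bpf_link_get_next_id(id, &id) == 0) {
                    struct bpf_link_info info;
                    __u32 len = sizeof(info);
                    int fd;

                    /* BPF_LINK_GET_FD_BY_ID (requires CAP_SYS_ADMIN). */
                    fd = bpf_link_get_fd_by_id(id);
                    if (fd < 0)
                            continue;

                    memset(&info, 0, sizeof(info));
                    /* BPF_OBJ_GET_INFO_BY_FD now understands link FDs. */
                    if (bpf_obj_get_info_by_fd(fd, &info, &len) == 0)
                            printf("link id=%u type=%u prog_id=%u\n",
                                   info.id, info.type, info.prog_id);
                    close(fd);
            }
            return 0;
    }
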
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++++- kernel/bpf/syscall.c | 155 +++++++++++++++++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 2 + 4 files changed, 181 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) 
#include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link 
*link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) 
#include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program -- cgit v1.2.3 From 9bf7c32409354b4a2fa3207d369a22c8233f718c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Apr 2020 18:38:54 -0500 Subject: posix-cpu-timers: Extend rcu_read_lock removing task_struct references Now that the code stores of pid references it is no longer necessary or desirable to take a reference on task_struct in __get_task_for_clock. Instead extend the scope of rcu_read_lock and remove the reference counting on struct task_struct entirely. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b7f972fb115e..91996dd089a4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -81,36 +81,36 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, } static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool getref, bool gettime) + bool gettime) { const bool thread = !!CPUCLOCK_PERTHREAD(clock); const pid_t pid = CPUCLOCK_PID(clock); - struct task_struct *p; if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) return NULL; - rcu_read_lock(); - p = lookup_task(pid, thread, gettime); - if (p && getref) - get_task_struct(p); - rcu_read_unlock(); - return p; + return lookup_task(pid, thread, gettime); } static inline struct task_struct *get_task_for_clock(const clockid_t clock) { - return __get_task_for_clock(clock, true, false); + return __get_task_for_clock(clock, false); } static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) { - return __get_task_for_clock(clock, true, true); + return __get_task_for_clock(clock, true); } static inline int validate_clock_permissions(const clockid_t clock) { - return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; + int ret; + + rcu_read_lock(); + ret = __get_task_for_clock(clock, false) ? 0 : -EINVAL; + rcu_read_unlock(); + + return ret; } static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) @@ -368,15 +368,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) struct task_struct *tsk; u64 t; + rcu_read_lock(); tsk = get_task_for_clock_get(clock); - if (!tsk) + if (!tsk) { + rcu_read_unlock(); return -EINVAL; + } if (CPUCLOCK_PERTHREAD(clock)) t = cpu_clock_sample(clkid, tsk); else t = cpu_clock_sample_group(clkid, tsk, false); - put_task_struct(tsk); + rcu_read_unlock(); *tp = ns_to_timespec64(t); return 0; @@ -389,19 +392,19 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p = get_task_for_clock(new_timer->it_clock); + struct task_struct *p; - if (!p) + rcu_read_lock(); + p = get_task_for_clock(new_timer->it_clock); + if (!p) { + rcu_read_unlock(); return -EINVAL; + } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); - /* - * get_task_for_clock() took a reference on @p. Drop it as the timer - * holds a reference on the pid of @p. - */ - put_task_struct(p); + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From fece98260f31292dd468925c7582065bf6193212 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 27 Apr 2020 09:38:29 -0500 Subject: posix-cpu-timers: Replace cpu_timer_pid_type with clock_pid_type Taking a clock and returning a pid_type is a more general and a superset of taking a timer and returning a pid_type. Perform this generalization so that future changes may use this code on clocks as well as timers. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 91996dd089a4..42f673974d71 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -113,14 +113,14 @@ static inline int validate_clock_permissions(const clockid_t clock) return ret; } -static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +static inline enum pid_type clock_pid_type(const clockid_t clock) { - return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; + return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID; } static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) { - return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); + return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock)); } /* @@ -403,7 +403,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); + new_timer->it.cpu.pid = get_task_pid(p, clock_pid_type(new_timer->it_clock)); rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 964987738b3fe557cb1ee37acb4e7f85e29b7cea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Apr 2020 07:55:07 -0500 Subject: posix-cpu-timers: Replace __get_task_for_clock with pid_for_clock Now that the codes store references to pids instead of referendes to tasks. Looking up a task for a clock instead of looking up a struct pid makes the code more difficult to verify it is correct than necessary. In posix_cpu_timers_create get_task_pid can race with release_task for threads and return a NULL pid. As put_pid and cpu_timer_task_rcu handle NULL pids just fine the code works without problems but it is an extra case to consider and keep in mind while verifying and modifying the code. There are races with de_thread to consider that only don't apply because thread clocks are only allowed for threads in the same thread_group. So instead of leaving a burden for people making modification to the code in the future return a rcu protected struct pid for the clock instead. The logic for __get_task_for_pid and lookup_task has been folded into the new function pid_for_clock with the only change being the logic has been modified from working on a task to working on a pid that will be returned. In posix_cpu_clock_get instead of calling pid_for_clock checking the result and then calling pid_task to get the task. The result of pid_for_clock is fed directly into pid_task. This is safe because pid_task handles NULL pids. As such an extra error check was unnecessary. Instead of hiding the flag that enables the special clock_gettime handling, I have made the 3 callers just pass the flag in themselves. That is less code and seems just as simple to work with as the wrapper functions. 
Historically the clock_gettime special case of allowing a process clock to be found by the thread id did not even exist [33ab0fec3352] but Thomas Gleixner reports that he has found code that uses that functionality [55e8c8eb2c7b]. Link: https://lkml.kernel.org/r/87zhaxqkwa.fsf@nanos.tec.linutronix.de/ Ref: 33ab0fec3352 ("posix-timers: Consolidate posix_cpu_clock_get()") Ref: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 75 +++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 42f673974d71..165117996ea0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -47,59 +47,44 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) /* * Functions for validating access to tasks. */ -static struct task_struct *lookup_task(const pid_t pid, bool thread, - bool gettime) +static struct pid *pid_for_clock(const clockid_t clock, bool gettime) { - struct task_struct *p; + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t upid = CPUCLOCK_PID(clock); + struct pid *pid; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; /* * If the encoded PID is 0, then the timer is targeted at current * or the process to which current belongs. */ - if (!pid) - return thread ? current : current->group_leader; + if (upid == 0) + return thread ? task_pid(current) : task_tgid(current); - p = find_task_by_vpid(pid); - if (!p) - return p; + pid = find_vpid(upid); + if (!pid) + return NULL; - if (thread) - return same_thread_group(p, current) ? p : NULL; + if (thread) { + struct task_struct *tsk = pid_task(pid, PIDTYPE_PID); + return (tsk && same_thread_group(tsk, current)) ? pid : NULL; + } /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. task->signal gives - * access to the group's clock. + * For clock_gettime(PROCESS) allow finding the process by + * with the pid of the current task. The code needs the tgid + * of the process so that pid_task(pid, PIDTYPE_TGID) can be + * used to find the process. */ - if (gettime && (p == current)) - return p; + if (gettime && (pid == task_pid(current))) + return task_tgid(current); /* - * For processes require that p is group leader. + * For processes require that pid identifies a process. */ - return thread_group_leader(p) ? p : NULL; -} - -static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool gettime) -{ - const bool thread = !!CPUCLOCK_PERTHREAD(clock); - const pid_t pid = CPUCLOCK_PID(clock); - - if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) - return NULL; - - return lookup_task(pid, thread, gettime); -} - -static inline struct task_struct *get_task_for_clock(const clockid_t clock) -{ - return __get_task_for_clock(clock, false); -} - -static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) -{ - return __get_task_for_clock(clock, true); + return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL; } static inline int validate_clock_permissions(const clockid_t clock) @@ -107,7 +92,7 @@ static inline int validate_clock_permissions(const clockid_t clock) int ret; rcu_read_lock(); - ret = __get_task_for_clock(clock, false) ? 0 : -EINVAL; + ret = pid_for_clock(clock, false) ? 
0 : -EINVAL; rcu_read_unlock(); return ret; @@ -369,7 +354,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) u64 t; rcu_read_lock(); - tsk = get_task_for_clock_get(clock); + tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock)); if (!tsk) { rcu_read_unlock(); return -EINVAL; @@ -392,18 +377,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p; + struct pid *pid; rcu_read_lock(); - p = get_task_for_clock(new_timer->it_clock); - if (!p) { + pid = pid_for_clock(new_timer->it_clock, false); + if (!pid) { rcu_read_unlock(); return -EINVAL; } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, clock_pid_type(new_timer->it_clock)); + new_timer->it.cpu.pid = get_pid(pid); rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? 
map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } -- cgit v1.2.3 From 449e14bfdb83bf772200840a7ac4dcc1d7cacf54 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Apr 2020 15:21:58 +0200 Subject: bpf: Fix unused variable warning Hiding the only using of bpf_link_type_strs[] in an #ifdef causes an unused-variable warning: kernel/bpf/syscall.c:2280:20: error: 'bpf_link_type_strs' defined but not used [-Werror=unused-variable] 2280 | static const char *bpf_link_type_strs[] = { Move the definition into the same #ifdef. 
Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200429132217.1294289-1-arnd@arndb.de --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d23c04cbe14f..c75b2dd2459c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2271,6 +2271,7 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, @@ -2282,7 +2283,6 @@ static const char *bpf_link_type_strs[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; -- cgit v1.2.3 From 3c2214b6027ff37945799de717c417212e1a8c54 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 21 Apr 2020 12:34:55 -0400 Subject: padata: add separate cpuhp node for CPUHP_PADATA_DEAD Removing the pcrypt module triggers this: general protection fault, probably for non-canonical address 0xdead000000000122 CPU: 5 PID: 264 Comm: modprobe Not tainted 5.6.0+ #2 Hardware name: QEMU Standard PC RIP: 0010:__cpuhp_state_remove_instance+0xcc/0x120 Call Trace: padata_sysfs_release+0x74/0xce kobject_put+0x81/0xd0 padata_free+0x12/0x20 pcrypt_exit+0x43/0x8ee [pcrypt] padata instances wrongly use the same hlist node for the online and dead states, so __padata_free()'s second cpuhp remove call chokes on the node that the first poisoned. cpuhp multi-instance callbacks only walk forward in cpuhp_step->list and the same node is linked in both the online and dead lists, so the list corruption that results from padata_alloc() adding the node to a second list without removing it from the first doesn't cause problems as long as no instances are freed. Avoid the issue by giving each state its own node. 
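The general rule this illustrates: a cpuhp multi-instance user needs one
hlist_node per hotplug state it registers the instance with. A minimal,
hypothetical sketch of the pattern (names are illustrative, not the padata
code):

    #include <linux/cpuhotplug.h>

    /* Hypothetical instance -- one node per cpuhp state it participates in. */
    struct foo_instance {
            struct hlist_node cpu_online_node;      /* for the online state */
            struct hlist_node cpu_dead_node;        /* for the dead state */
            /* ... */
    };

    static int foo_add_instance(enum cpuhp_state hp_online,
                                enum cpuhp_state hp_dead,
                                struct foo_instance *inst)
    {
            int ret;

            ret = cpuhp_state_add_instance_nocalls(hp_online,
                                                   &inst->cpu_online_node);
            if (ret)
                    return ret;

            /* Reusing cpu_online_node here is what corrupted the padata lists. */
            ret = cpuhp_state_add_instance_nocalls(hp_dead,
                                                   &inst->cpu_dead_node);
            if (ret)
                    cpuhp_state_remove_instance_nocalls(hp_online,
                                                        &inst->cpu_online_node);
            return ret;
    }
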
Fixes: 894c9ef9780c ("padata: validate cpumask without removed CPU during offline") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Herbert Xu --- kernel/padata.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..aae789896616 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,8 +734,9 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); @@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); -- cgit v1.2.3 From 1dd694a1b72f69a3add938f4b5bfb4cf9914b133 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Apr 2020 12:19:04 +0200 Subject: remove the no longer needed pid_alive() check in __task_pid_nr_ns() Starting from 2c4704756cab ("pids: Move the pgrp and session pid pointers from task_struct to signal_struct") __task_pid_nr_ns() doesn't dereference task->group_leader, we can remove the pid_alive() check. pid_nr_ns() has to check pid != NULL anyway, pid_alive() just adds the unnecessary confusion. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Acked-by: Christian Brauner Signed-off-by: Eric W. Biederman --- kernel/pid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 6d5d0a5bda82..f1496b757162 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -495,8 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; -- cgit v1.2.3 From 2ed6edd33a214bca02bd2b45e3fc3038a059436b Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Tue, 14 Apr 2020 18:29:20 -0400 Subject: perf: Add cond_resched() to task_function_call() Under rare circumstances, task_function_call() can repeatedly fail and cause a soft lockup. 
There is a slight race where the process is no longer running on the cpu we targeted by the time remote_function() runs. The code will simply try again. If we are very unlucky, this will continue to fail, until a watchdog fires. This can happen in a heavily loaded, multi-core virtual machine. Reported-by: syzbot+bb4935a5c09b5ff79940@syzkaller.appspotmail.com Signed-off-by: Barret Rhoden Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200414222920.121401-1-brho@google.com --- kernel/events/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 52951e9e8e1b..80cf996a7f19 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } -- cgit v1.2.3 From f080d93e1d419099a99d7473ed532289ca8dc717 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 14 Apr 2020 20:57:21 +0800 Subject: sched/debug: Fix trival print_task() format Ensure leave one space between state and task name. 
w/o patch: runnable tasks: S task PID tree-key switches prio wait Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200414125721.195801-1-xiexiuqi@huawei.com --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a562df57a86e..b3ac1c10b032 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { -- cgit v1.2.3 From e98fa02c4f2ea4991dae422ac7e34d102d2f0599 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 10 Apr 2020 15:52:07 -0700 Subject: sched/fair: Eliminate bandwidth race between throttling and distribution There is a race window in which an entity begins throttling before quota is added to the pool, but does not finish throttling until after we have finished with distribute_cfs_runtime(). This entity is not observed by distribute_cfs_runtime() because it was not on the throttled list at the time that distribution was running. This race manifests as rare period-length statlls for such entities. Rather than heavy-weight the synchronization with the progress of distribution, we can fix this by aborting throttling if bandwidth has become available. Otherwise, we immediately add the entity to the throttled list so that it can be observed by a subsequent distribution. Additionally, we can remove the case of adding the throttled entity to the head of the throttled list, and simply always add to the tail. Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto its own pool of runtime. This means that if we do hit the !assign and distribute_running case, we know that distribution is about to end. 
Signed-off-by: Paul Turner Signed-off-by: Ben Segall Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-2-joshdon@google.com --- kernel/sched/fair.c | 79 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..0c13a41bde81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4588,16 +4588,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4609,13 +4609,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4704,13 +4716,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4744,29 +4776,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. 
If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -5121,8 +5137,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -- cgit v1.2.3 From ab93a4bc955b3980c699430bc0b633f0d8b607be Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 10 Apr 2020 15:52:08 -0700 Subject: sched/fair: Remove distribute_running from CFS bandwidth This is mostly a revert of commit: baa9be4ffb55 ("sched/fair: Fix throttle_list starvation with low CFS quota") The primary use of distribute_running was to determine whether to add throttled entities to the head or the tail of the throttled list. Now that we always add to the tail, we can remove this field. The other use of distribute_running is in the slack_timer, so that we don't start a distribution while one is already running. However, even in the event that this race occurs, it is fine to have two distributions running (especially now that distribute grabs the cfs_b->lock to determine remaining quota before assigning). Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-3-joshdon@google.com --- kernel/sched/fair.c | 13 +------------ kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c13a41bde81..3d6ce751abb6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4931,14 +4931,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. 
*/ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5052,10 +5050,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5065,9 +5059,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5076,7 +5067,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5218,7 +5208,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..7198683b2869 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,7 +349,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; -- cgit v1.2.3 From 64297f2b03cc7a6d0184a435f1b296beca1bedd1 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Sat, 11 Apr 2020 17:20:20 +0800 Subject: sched/fair: Simplify the code of should_we_balance() We only consider group_balance_cpu() after there is no idle cpu. So, just do comparison before return at these two cases. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/245c792f0e580b3ca342ad61257f4c066ee0f84f.1586594833.git.rocking@linux.alibaba.com --- kernel/sched/fair.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3d6ce751abb6..63419c6d9641 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9413,7 +9413,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9434,18 +9434,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. 
- */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* -- cgit v1.2.3 From 586b58cac8b4683eb58a1446fbc399de18974e40 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 5 Mar 2020 23:06:57 +0100 Subject: exit: Move preemption fixup up, move blocking operations down With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in non-preemptible context look untidy; after the main oops, the kernel prints a "sleeping function called from invalid context" report because exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read() can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED) fixup. It looks like the same thing applies to profile_task_exit() and kcov_task_exit(). Fix it by moving the preemption fixup up and the calls to profile_task_exit() and kcov_task_exit() down. Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks") Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200305220657.46800-1-jannh@google.com --- kernel/exit.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..d56fe51bdf07 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -708,8 +708,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +731,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +758,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); -- cgit v1.2.3 From 45da27732b0b9b7a04696653065d5e6037bc5ac0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:04 +0100 Subject: sched/fair: find_idlest_group(): Remove unused sd_flag parameter The last use of that parameter was removed by commit 57abff067a08 ("sched/fair: Rework find_idlest_group()") Get rid of the parameter. 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200415210512.805-2-valentin.schneider@arm.com --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63419c6d9641..617ca44ed61b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5825,8 +5825,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5909,7 +5908,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -8681,8 +8680,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; -- cgit v1.2.3 From 9818427c6270a9ce8c52c8621026fe9cebae0f92 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:05 +0100 Subject: sched/debug: Make sd->flags sysctl read-only Writing to the sysctl of a sched_domain->flags directly updates the value of the field, and goes nowhere near update_top_cache_domain(). This means that the cached domain pointers can end up containing stale data (e.g. the domain pointed to doesn't have the relevant flag set anymore). Explicit domain walks that check for flags will be affected by the write, but this won't be in sync with the cached pointers which will still point to the domains that were cached at the last sched_domain build. In other words, writing to this interface is playing a dangerous game. It could be made to trigger an update of the cached sched_domain pointers when written to, but this does not seem to be worth the trouble. Make it read-only. 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-3-valentin.schneider@arm.com --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b3ac1c10b032..c6cc02a6aad0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ -- cgit v1.2.3 From e669ac8ab952df2f07dee1e1efbf40647d6de332 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:06 +0100 Subject: sched: Remove checks against SD_LOAD_BALANCE The SD_LOAD_BALANCE flag is set unconditionally for all domains in sd_init(). By making the sched_domain->flags syctl interface read-only, we have removed the last piece of code that could clear that flag - as such, it will now be always present. Rather than to keep carrying it along, we can work towards getting rid of it entirely. cpusets don't need it because they can make CPUs be attached to the NULL domain (e.g. cpuset with sched_load_balance=0), or to a partitioned root_domain, i.e. a sched_domain hierarchy that doesn't span the entire system (e.g. root cpuset with sched_load_balance=0 and sibling cpusets with sched_load_balance=1). isolcpus apply the same "trick": isolated CPUs are explicitly taken out of the sched_domain rebuild (using housekeeping_cpumask()), so they get the NULL domain treatment as well. Remove the checks against SD_LOAD_BALANCE. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-4-valentin.schneider@arm.com --- kernel/sched/fair.c | 14 ++------------ kernel/sched/topology.c | 28 +++++++++------------------- 2 files changed, 11 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 617ca44ed61b..4b959c0a7d7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6649,9 +6649,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -9790,9 +9787,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9881,9 +9877,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10472,9 +10465,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..a9dc34a0ebc1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } -- cgit v1.2.3 From 36c5bdc4387056af3840adb4478c752faeb9d15e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:07 +0100 Subject: sched/topology: Kill SD_LOAD_BALANCE That flag is set unconditionally in sd_init(), and no one checks for it anymore. Remove it. 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-5-valentin.schneider@arm.com --- kernel/sched/topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a9dc34a0ebc1..1d7b446fac7d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1341,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE -- cgit v1.2.3 From d91cecc156620ec75d94c55369509c807c3d07e6 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 21 Apr 2020 18:50:34 +0800 Subject: sched: Make newidle_balance() static again After Commit 6e2df0581f56 ("sched: Fix pick_next_task() vs 'change' pattern race"), there is no need to expose newidle_balance() as it is only used within fair.c file. Change this function back to static again. No functional change. Reported-by: kbuild test robot Suggested-by: Peter Zijlstra Signed-off-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/83cd3030b031ca5d646cd5e225be10e7a0fdd8f5.1587464698.git.yu.c.chen@intel.com --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 4 ---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b959c0a7d7b..c0216ef33c0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3873,6 +3873,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -4054,7 +4056,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -10414,7 +10416,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7198683b2869..978c6fac8cb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1503,14 +1503,10 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); - #else static inline void sched_ttwu_pending(void) { } -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From 457d1f465778ccb5f14f7d7a62245e41d12a3804 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 21 Apr 2020 18:50:43 +0800 Subject: sched: Extract the task putting code from pick_next_task() Introduce a new function put_prev_task_balance() to do the balance when necessary, and then put previous task back to the run queue. This function is extracted from pick_next_task() to prepare for future usage by other type of task picking logic. 
No functional change. Suggested-by: Peter Zijlstra Signed-off-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/5a99860cf66293db58a397d6248bcb2eee326776.1587464698.git.yu.c.chen@intel.com --- kernel/sched/core.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..2e6ba9e301f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3899,6 +3899,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3932,22 +3954,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); -- cgit v1.2.3 From 5a6d6a6ccb5f48ca8cf7c6d64ff83fd9c7999390 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Mon, 20 Apr 2020 10:44:21 +0800 Subject: sched/fair: Refill bandwidth before scaling In order to prevent possible hardlockup of sched_cfs_period_timer() loop, loop count is introduced to denote whether to scale quota and period or not. However, scale is done between forwarding period timer and refilling cfs bandwidth runtime, which means that period timer is forwarded with old "period" while runtime is refilled with scaled "quota". Move do_sched_cfs_period_timer() before scaling to solve this. 
Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup") Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200420024421.22442-3-changhuaixin@linux.alibaba.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0216ef33c0c..fac5b2f85247 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5159,6 +5159,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5188,8 +5190,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; -- cgit v1.2.3 From f38f12d1e0811c0ee59260b2bdadedf99e16c4af Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 6 Apr 2020 15:47:50 +0800 Subject: sched/fair: Mark sched_init_granularity __init Function sched_init_granularity() is only called from __init functions, so mark it __init as well. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200406074750.56533-1-songmuchun@bytedance.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fac5b2f85247..cd7fd7e2b579 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -191,7 +191,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } -- cgit v1.2.3 From bf2c59fce4074e55d622089b34be3a6bc95484fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 17:40:33 -0400 Subject: sched/core: Fix illegal RCU from offline CPUs In the CPU-offline process, it calls mmdrop() after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaining when mmdrop() uses RCU from either memcg or debugobjects below. Fix it by cleaning up the active_mm state from BP instead. Every arch which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit() from AP. The only exception is parisc because it switches them to &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()), but the patch will still work there because it calls mmgrab(&init_mm) in smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu(). WARNING: suspicious RCU usage ----------------------------- kernel/workqueue.c:710 RCU or wq_pool_mutex should be held! other info that might help us debug this: RCU used illegally from offline CPU! 
Call Trace: dump_stack+0xf4/0x164 (unreliable) lockdep_rcu_suspicious+0x140/0x164 get_work_pool+0x110/0x150 __queue_work+0x1bc/0xca0 queue_work_on+0x114/0x120 css_release+0x9c/0xc0 percpu_ref_put_many+0x204/0x230 free_pcp_prepare+0x264/0x570 free_unref_page+0x38/0xf0 __mmdrop+0x21c/0x2c0 idle_task_exit+0x170/0x1b0 pnv_smp_cpu_kill_self+0x38/0x2e0 cpu_die+0x48/0x64 arch_cpu_idle_dead+0x30/0x50 do_idle+0x2f4/0x470 cpu_startup_entry+0x38/0x40 start_secondary+0x7a8/0xa80 start_secondary_resume+0x10/0x14 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman (powerpc) Link: https://lkml.kernel.org/r/20200401214033.8448-1-cai@lca.pw --- kernel/cpu.c | 18 +++++++++++++++++- kernel/sched/core.c | 5 +++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..244d30544377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include #include #include #include @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e6ba9e301f0..f6ae2629f4e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6197,13 +6197,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* -- cgit v1.2.3 From 17c891ab349138e8d8a59ca2700f42ce8af96f4e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 21 Apr 2020 22:41:23 +0800 Subject: sched/fair: Use __this_cpu_read() in wake_wide() The code is executed with preemption(and interrupts) disabled, so it's safe to use __this_cpu_write(). 
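For context, a hedged illustration of the accessor distinction this change relies on (assuming the usual percpu accessor semantics; this is not code from the patch): this_cpu_read() is safe to call with preemption enabled because the accessor itself guards against CPU migration, while the double-underscore variant omits that protection and is only valid when the caller already runs with preemption disabled, as wake_wide() does.

/* Illustration only: both forms read the same per-CPU variable. */
int factor;

factor = this_cpu_read(sd_llc_size);	/* accessor is preemption-safe by itself */

preempt_disable();
factor = __this_cpu_read(sd_llc_size);	/* caller guarantees no CPU migration */
preempt_enable();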
Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421144123.33580-1-songmuchun@bytedance.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd7fd7e2b579..46b7bd41573f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5718,7 +5718,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); -- cgit v1.2.3 From b1d1779e5ef7a60b192b61fd97201f322e1e9303 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 23 Apr 2020 21:44:43 +0000 Subject: sched/core: Simplify sched_init() Currently root_task_group.shares and cfs_bandwidth are initialized for each online cpu, which not necessary. Let's take it out to do it only once. Signed-off-by: Wei Yang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200423214443.29994-1-richard.weiyang@gmail.com --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6ae2629f4e1..b58efb1156eb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6597,6 +6597,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6649,7 +6651,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6671,7 +6672,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 41cd780524674082b037e7c8461f90c5e42103f0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 3 Apr 2020 07:20:51 +0000 Subject: uaccess: Selectively open read or write user access When opening user access to only perform reads, only open read access. When opening user access to only perform writes, only open write access. 
Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2e73bc57125c2c6ab12a587586a4eed3a47105fc.1585898438.git.christophe.leroy@c-s.fr --- kernel/compat.c | 12 ++++++------ kernel/exit.c | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index 389a88cb3081..2d97cbba512d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1557,7 +1557,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1566,10 +1566,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1684,7 +1684,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1693,10 +1693,10 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif -- cgit v1.2.3 From db9ff6ecf6efa6ffc45bfd5dc8d5708cfe5e89cb Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Wed, 29 Apr 2020 17:26:48 +0800 Subject: audit: make symbol 'audit_nfcfgs' static Fix sparse warnings: kernel/auditsc.c:138:32: warning: symbol 'audit_nfcfgs' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d281c18d1771..cfe3486e5f31 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -135,7 +135,7 @@ struct audit_nfcfgop_tab { const char *s; }; -const struct audit_nfcfgop_tab audit_nfcfgs[] = { +static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_XT_OP_REGISTER, "register" }, { AUDIT_XT_OP_REPLACE, "replace" }, { AUDIT_XT_OP_UNREGISTER, "unregister" }, -- cgit v1.2.3 From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keep backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. 
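A minimal userspace sketch of steps 1 and 5 of that flow, using the raw bpf(2) syscall with the BPF_ENABLE_STATS command and BPF_STATS_RUN_TIME type added by this patch (assumes UAPI headers that already carry these definitions; error handling and the run_time_ns sampling between the two steps are left out):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int enable_bpf_run_time_stats(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.enable_stats.type = BPF_STATS_RUN_TIME;

	/* On success this returns an fd; stats stay enabled until it is closed. */
	return syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
}

Each tool that wants stats takes its own fd, so concurrent users no longer race on a single sysctl toggle; closing the fd drops that user's reference via bpf_stats_release().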
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 ++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) -- cgit v1.2.3 From 138c67677ff5ac0bce7131033c39d52a81e87a60 Mon Sep 
17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 May 2020 11:56:22 -0700 Subject: bpf: Fix use-after-free of bpf_link when priming half-fails If bpf_link_prime() succeeds to allocate new anon file, but then fails to allocate ID for it, link priming is considered to be failed and user is supposed ot be able to directly kfree() bpf_link, because it was never exposed to user-space. But at that point file already keeps a pointer to bpf_link and will eventually call bpf_link_release(), so if bpf_link was kfree()'d by caller, that would lead to use-after-free. Fix this by first allocating ID and only then allocating file. Adding ID to link_idr is ok, because link at that point still doesn't have its ID set, so no user-space process can create a new FD for it. Fixes: a3b80e107894 ("bpf: Allocate ID for bpf_link") Reported-by: syzbot+39b64425f91b5aab714d@syzkaller.appspotmail.com Suggested-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200501185622.3088964-1-andriin@fb.com --- kernel/bpf/syscall.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4f34eecec9ce..bb1ab7da6103 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2348,19 +2348,20 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) if (fd < 0) return fd; - file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } id = bpf_link_alloc_id(link); if (id < 0) { put_unused_fd(fd); - fput(file); return id; } + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + bpf_link_free_id(id); + put_unused_fd(fd); + return PTR_ERR(file); + } + primer->link = link; primer->file = file; primer->fd = fd; -- cgit v1.2.3 From f187b6974f6dfbeba4aafda972cc37f27d091b73 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Wed, 29 Apr 2020 12:04:13 +0800 Subject: workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO. Replace inline function PTR_ERR_OR_ZERO with IS_ERR and PTR_ERR to remove redundant parameter definitions and checks. Reduce code size. 
Before: text data bss dec hex filename 47510 5979 840 54329 d439 kernel/workqueue.o After: text data bss dec hex filename 47474 5979 840 54293 d415 kernel/workqueue.o Signed-off-by: Sean Fu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 891ccad5f271..ddf0537dce14 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,7 +4197,6 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; - int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,10 +4207,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - ret = PTR_ERR_OR_ZERO(rescuer->task); - if (ret) { + if (IS_ERR(rescuer->task)) { kfree(rescuer); - return ret; + return PTR_ERR(rescuer->task); } wq->rescuer = rescuer; -- cgit v1.2.3 From 5447e8e01e101ba19fe5b7551f02d37367156f6b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2020 16:07:12 +0200 Subject: sysctl: Fix unused function warning The newly added bpf_stats_handler function has the wrong #ifdef check around it, leading to an unused-function warning when CONFIG_SYSCTL is disabled: kernel/sysctl.c:205:12: error: unused function 'bpf_stats_handler' [-Werror,-Wunused-function] static int bpf_stats_handler(struct ctl_table *table, int write, Fix the check to match the reference. Fixes: d46edd671a14 ("bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS") Signed-off-by: Arnd Bergmann Signed-off-by: Alexei Starovoitov Reviewed-by: Luis Chamberlain Acked-by: Martin KaFai Lau Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200505140734.503701-1-arnd@arndb.de --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7adfe5dbce9d..17c7633d90fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,7 +201,7 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From c3b3f52476412a3899f2c65b220075aceb18dd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 12:12:53 +0200 Subject: signal: refactor copy_siginfo_to_user32 Factor out a copy_siginfo_to_external32 helper from copy_siginfo_to_user32 that fills out the compat_siginfo, but does so on a kernel space data structure. With that we can let architectures override copy_siginfo_to_user32 with their own implementations using copy_siginfo_to_external32. That allows moving the x32 SIGCHLD purely to x86 architecture code. As a nice side effect copy_siginfo_to_external32 also comes in handy for avoiding a set_fs() call in the coredump code later on. Contains improvements from Eric W. Biederman and Arnd Bergmann . 
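The architecture-side users the message refers to live outside this (limited to 'kernel') diff. A hypothetical sketch of what an override built on the new helper could look like -- the body below is illustrative and not the actual x86 code:

/* Hypothetical arch override: fill a kernel-space compat_siginfo via the
 * generic helper, apply any ABI-specific fixup (e.g. the x32 SIGCHLD
 * layout), then copy the result to user space in one go. */
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
			   const struct kernel_siginfo *from)
{
	struct compat_siginfo new;

	copy_siginfo_to_external32(&new, from);
	/* ... ABI-specific adjustments would go here ... */
	if (copy_to_user(to, &new, sizeof(new)))
		return -EFAULT;
	return 0;
}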
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/signal.c | 106 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e58a6c619824..2bc9458d40bd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; 
-#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } -- cgit v1.2.3 From 19acd03d95dad1f50d06f28179a1866fca431fed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Apr 2020 17:47:29 +0200 Subject: kcsan: Add __kcsan_{enable,disable}_current() variants The __kcsan_{enable,disable}_current() variants only call into KCSAN if KCSAN is enabled for the current compilation unit. Note: This is typically not what we want, as we usually want to ensure that even calls into other functions still have KCSAN disabled. These variants may safely be used in header files that are shared between regular kernel code and code that does not link the KCSAN runtime. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index a572aae61b98..a73a66cf79df 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -625,6 +625,13 @@ void kcsan_enable_current(void) } EXPORT_SYMBOL(kcsan_enable_current); +void kcsan_enable_current_nowarn(void) +{ + if (get_ctx()->disable_count-- == 0) + kcsan_disable_current(); +} +EXPORT_SYMBOL(kcsan_enable_current_nowarn); + void kcsan_nestable_atomic_begin(void) { /* -- cgit v1.2.3 From 565558558985b1d7cd43b21f18c1ad6b232788d0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:03 +0100 Subject: cpu/hotplug: Remove disable_nonboot_cpus() The single user could have called freeze_secondary_cpus() directly. Since this function was a source of confusion, remove it as it's just a pointless wrapper. While at it, rename enable_nonboot_cpus() to thaw_secondary_cpus() to preserve the naming symmetry. Done automatically via: git grep -l enable_nonboot_cpus | xargs sed -i 's/enable_nonboot_cpus/thaw_secondary_cpus/g' Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-1-qais.yousef@arm.com --- kernel/cpu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a02d4434cf7..d766929e0b7e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1376,8 +1376,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. 
We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1385,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1405,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1418,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: -- cgit v1.2.3 From fb7fb84a0c4e8021ddecb157802d58241a3f1a40 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:04 +0100 Subject: cpu/hotplug: Remove __freeze_secondary_cpus() The refactored function is no longer required as the codepaths that call freeze_secondary_cpus() are all suspend/resume related now. Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-2-qais.yousef@arm.com --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d766929e0b7e..9f892144db6b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1327,7 +1327,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1352,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; -- cgit v1.2.3 From a13502073638a0b28d84099fa985971fe3287aee Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 6 May 2020 18:17:27 +0300 Subject: kgdb: Drop malformed kernel doc comment Kernel doc does not understand POD variables to be referred to. .../debug_core.c:73: warning: cannot understand function prototype: 'int kgdb_connected; ' Convert kernel doc to pure comment. Fixes: dc7d55270521 ("kgdb: core") Cc: Jason Wessel Signed-off-by: Andy Shevchenko Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..2266ba27f27d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -67,9 +67,7 @@ static int kgdb_break_asap; struct debuggerinfo_struct kgdb_info[NR_CPUS]; -/** - * kgdb_connected - Is a host GDB connected to us? - */ +/* kgdb_connected - Is a host GDB connected to us? */ int kgdb_connected; EXPORT_SYMBOL_GPL(kgdb_connected); -- cgit v1.2.3 From 19a8ff956c5abaaedfae23b8a951dd2d725a2171 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 11 Mar 2020 17:39:12 -0700 Subject: rcutorture: Add flag to produce non-busy-wait task stalls This commit aids testing of RCU task stall warning messages by adding an rcutorture.stall_cpu_block module parameter that results in the induced stall sleeping within the RCU read-side critical section. Spinning with interrupts disabled is still available via the rcutorture.stall_cpu_irqsoff module parameter, and specifying neither of these two module parameters will spin with preemption disabled. Note that sleeping (as opposed to preemption) results in additional complaints from RCU at context-switch time, so yet more testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d0345d14e22a..60dc36893aad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -114,6 +114,7 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1548,6 +1549,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1556,6 +1558,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1611,6 +1614,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1622,21 +1626,22 @@ static int rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); + cur_ops->readunlock(idx); pr_alert("rcu_torture_stall end.\n"); } torture_shutdown_absorb("rcu_torture_stall"); -- cgit v1.2.3 From 55b2dcf58700041d6f0b037a98619222c825f004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Apr 2020 19:57:52 -0700 Subject: rcu: Allow rcutorture to starve grace-period kthread This commit provides an rcutorture.stall_gp_kthread module parameter to allow rcutorture to starve the grace-period kthread. 
This allows testing the code that detects such starvation. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 18 +++++++++++++++--- kernel/rcu/tree.c | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cdbc5f98f584 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -454,6 +454,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +472,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 60dc36893aad..3d47dca4d61c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,8 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1623,7 +1625,17 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); @@ -1642,8 +1654,8 @@ static int rcu_torture_stall(void *args) else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); - pr_alert("rcu_torture_stall end.\n"); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1653,7 +1665,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..be7dde8807d0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1486,6 +1486,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. 
*/ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1686,6 +1711,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1834,6 +1860,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) -- cgit v1.2.3 From afbc1574f1da13d2fd2b30a96090b37c5933f957 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 9 Apr 2020 19:42:38 +0800 Subject: rcutorture: Make rcu_fwds and rcu_fwd_emergency_stop static This commit fixes the following sparse warning: kernel/rcu/rcutorture.c:1695:16: warning: symbol 'rcu_fwds' was not declared. Should it be static? kernel/rcu/rcutorture.c:1696:6: warning: symbol 'rcu_fwd_emergency_stop' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3d47dca4d61c..c7b7594bd2d8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1721,8 +1721,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { -- cgit v1.2.3 From 3c80b4024579150ddb8ddd6fd7b110ba192aca3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 15:37:12 -0700 Subject: rcutorture: Convert ULONG_CMP_LT() to time_before() This commit converts three ULONG_CMP_LT() invocations in rcutorture to time_before() to reflect the fact that they are comparing timestamps to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c7b7594bd2d8..fc961472dc8e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -848,7 +848,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -858,7 +858,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. 
*/ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -929,7 +929,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } -- cgit v1.2.3 From 96ecee29b0b560662ec082ee9b6f2049f2a79090 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 3 May 2020 06:48:17 -0500 Subject: exec: Merge install_exec_creds into setup_new_exec The two functions are now always called one right after the other so merge them together to make future maintenance easier. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. Biederman" --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..169449b5e56b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12217,7 +12217,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_mutex held when called from - * install_exec_creds(). + * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { -- cgit v1.2.3 From dcf550e52f567cb7a421169d2522869f9188aca5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:43 -0500 Subject: livepatch: Disallow vmlinux.ko This is purely a theoretical issue, but if there were a module named vmlinux.ko, the livepatch relocation code wouldn't be able to distinguish between vmlinux-specific and vmlinux.o-specific KLP relocations. If CONFIG_LIVEPATCH is enabled, don't allow a module named vmlinux.ko. Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Miroslav Benes Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c3512e7e0801..40cfac8156fd 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1139,6 +1139,11 @@ int klp_module_coming(struct module *mod) if (WARN_ON(mod->state != MODULE_STATE_COMING)) return -EINVAL; + if (!strcmp(mod->name, "vmlinux")) { + pr_err("vmlinux.ko: invalid module name"); + return -EINVAL; + } + mutex_lock(&klp_mutex); /* * Each module has to know that klp_module_coming() -- cgit v1.2.3 From 7c8e2bdd5f0d990e2398ee3deafc626dd469fc2d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:44 -0500 Subject: livepatch: Apply vmlinux-specific KLP relocations early KLP relocations are livepatch-specific relocations which are applied to a KLP module's text or data. They exist for two reasons: 1) Unexported symbols: replacement functions often need to access unexported symbols (e.g. static functions), which "normal" relocations don't allow. 2) Late module patching: this is the ability for a KLP module to bypass normal module dependencies, such that the KLP module can be loaded *before* a to-be-patched module. 
This means that relocations which need to access symbols in the to-be-patched module might need to be applied to the KLP module well after it has been loaded. Non-late-patched KLP relocations are applied from the KLP module's init function. That usually works fine, unless the patched code wants to use alternatives, paravirt patching, jump tables, or some other special section which needs relocations. Then we run into ordering issues and crashes. In order for those special sections to work properly, the KLP relocations should be applied *before* the special section init code runs, such as apply_paravirt(), apply_alternatives(), or jump_label_apply_nops(). You might think the obvious solution would be to move the KLP relocation initialization earlier, but it's not necessarily that simple. The problem is the above-mentioned late module patching, for which KLP relocations can get applied well after the KLP module is loaded. To "fix" this issue in the past, we created .klp.arch sections: .klp.arch.{module}..altinstructions .klp.arch.{module}..parainstructions Those sections allow KLP late module patching code to call apply_paravirt() and apply_alternatives() after the module-specific KLP relocations (.klp.rela.{module}.{section}) have been applied. But that has a lot of drawbacks, including code complexity, the need for arch-specific code, and the (per-arch) danger that we missed some special section -- for example the __jump_table section which is used for jump labels. It turns out there's a simpler and more functional approach. There are two kinds of KLP relocation sections: 1) vmlinux-specific KLP relocation sections .klp.rela.vmlinux.{sec} These are relocations (applied to the KLP module) which reference unexported vmlinux symbols. 2) module-specific KLP relocation sections .klp.rela.{module}.{sec}: These are relocations (applied to the KLP module) which reference unexported or exported module symbols. Up until now, these have been treated the same. However, they're inherently different. Because of late module patching, module-specific KLP relocations can be applied very late, thus they can create the ordering headaches described above. But vmlinux-specific KLP relocations don't have that problem. There's nothing to prevent them from being applied earlier. So apply them at the same time as normal relocations, when the KLP module is being loaded. This means that for vmlinux-specific KLP relocations, we no longer have any ordering issues. vmlinux-referencing jump labels, alternatives, and paravirt patching will work automatically, without the need for the .klp.arch hacks. All that said, for module-specific KLP relocations, the ordering problems still exist and we *do* still need .klp.arch. Or do we? Stay tuned. 
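Since the distinction between the two kinds is carried entirely by the section name, classifying a section only requires parsing the object name back out of it. A minimal sketch of that classification (the helper name is made up for illustration; the field width matches MODULE_NAME_LEN as in klp_resolve_symbols()):

	/* ".klp.rela.<objname>.<secname>": true if the section targets vmlinux */
	static bool klp_sec_targets_vmlinux(const char *shstrtab, Elf_Shdr *sec)
	{
		char sec_objname[MODULE_NAME_LEN];

		if (sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]",
			   sec_objname) != 1)
			return false;	/* malformed section name */

		return !strcmp(sec_objname, "vmlinux");
	}
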
Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 137 ++++++++++++++++++++++++++++++------------------ kernel/module.c | 10 ++-- 2 files changed, 92 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 40cfac8156fd..c02791e5c75b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -191,12 +191,12 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) +static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, + unsigned int symndx, Elf_Shdr *relasec) { int i, cnt, vmlinux, ret; char objname[MODULE_NAME_LEN]; char symname[KSYM_NAME_LEN]; - char *strtab = pmod->core_kallsyms.strtab; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; @@ -216,7 +216,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); @@ -246,54 +246,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) return 0; } -static int klp_write_object_relocations(struct module *pmod, - struct klp_object *obj) +/* + * At a high-level, there are two types of klp relocation sections: those which + * reference symbols which live in vmlinux; and those which reference symbols + * which live in other modules. This function is called for both types: + * + * 1) When a klp module itself loads, the module code calls this function to + * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections). + * These relocations are written to the klp module text to allow the patched + * code/data to reference unexported vmlinux symbols. They're written as + * early as possible to ensure that other module init code (.e.g., + * jump_label_apply_nops) can access any unexported vmlinux symbols which + * might be referenced by the klp module's special sections. + * + * 2) When a to-be-patched module loads -- or is already loaded when a + * corresponding klp module loads -- klp code calls this function to write + * module-specific klp relocations (.klp.rela.{module}.* sections). These + * are written to the klp module text to allow the patched code/data to + * reference symbols which live in the to-be-patched module or one of its + * module dependencies. Exported symbols are supported, in addition to + * unexported symbols, in order to enable late module patching, which allows + * the to-be-patched module to be loaded and patched sometime *after* the + * klp module is loaded. 
+ */ +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) { - int i, cnt, ret = 0; - const char *objname, *secname; + int cnt, ret; char sec_objname[MODULE_NAME_LEN]; - Elf_Shdr *sec; + Elf_Shdr *sec = sechdrs + secndx; - if (WARN_ON(!klp_is_object_loaded(obj))) + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name\n", + shstrtab + sec->sh_name); return -EINVAL; + } - objname = klp_is_module(obj) ? obj->name : "vmlinux"; - - /* For each klp relocation section */ - for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { - sec = pmod->klp_info->sechdrs + i; - secname = pmod->klp_info->secstrings + sec->sh_name; - if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) - continue; - - /* - * Format: .klp.rela.sec_objname.section_name - * See comment in klp_resolve_symbols() for an explanation - * of the selected field width value. - */ - cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); - if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name\n", - secname); - ret = -EINVAL; - break; - } - - if (strcmp(objname, sec_objname)) - continue; - - ret = klp_resolve_symbols(sec, pmod); - if (ret) - break; + if (strcmp(objname ? objname : "vmlinux", sec_objname)) + return 0; - ret = apply_relocate_add(pmod->klp_info->sechdrs, - pmod->core_kallsyms.strtab, - pmod->klp_info->symndx, i, pmod); - if (ret) - break; - } + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec); + if (ret) + return ret; - return ret; + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); } /* @@ -730,6 +735,28 @@ void __weak arch_klp_init_object_loaded(struct klp_patch *patch, { } +int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) +{ + int i, ret; + struct klp_modinfo *info = patch->mod->klp_info; + + for (i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *sec = info->sechdrs + i; + + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; + + ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + info->secstrings, + patch->mod->core_kallsyms.strtab, + info->symndx, i, obj->name); + if (ret) + return ret; + } + + return 0; +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -738,18 +765,26 @@ static int klp_init_object_loaded(struct klp_patch *patch, int ret; mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; + + if (klp_is_module(obj)) { + /* + * Only write module-specific relocations here + * (.klp.rela.{module}.*). vmlinux-specific relocations were + * written earlier during the initialization of the klp module + * itself. 
+ */ + ret = klp_apply_object_relocs(patch, obj); + if (ret) { + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + return ret; + } } arch_klp_init_object_loaded(patch, obj); - module_enable_ro(patch->mod, true); + module_enable_ro(patch->mod, true); mutex_unlock(&text_mutex); klp_for_each_func(obj, func) { diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..fdd9f6970e9a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2334,11 +2334,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; - /* Livepatch relocation sections are applied by livepatch */ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); else if (info->sechdrs[i].sh_type == SHT_RELA) -- cgit v1.2.3 From 1d05334d2899bd3ecdf01beb53f0a70884a7f471 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 10:24:45 -0500 Subject: livepatch: Remove .klp.arch After the previous patch, vmlinux-specific KLP relocations are now applied early during KLP module load. This means that .klp.arch sections are no longer needed for *vmlinux-specific* KLP relocations. One might think they're still needed for *module-specific* KLP relocations. If a to-be-patched module is loaded *after* its corresponding KLP module is loaded, any corresponding KLP relocations will be delayed until the to-be-patched module is loaded. If any special sections (.parainstructions, for example) rely on those relocations, their initializations (apply_paravirt) need to be done afterwards. Thus the apparent need for arch_klp_init_object_loaded() and its corresponding .klp.arch sections -- it allows some of the special section initializations to be done at a later time. But... if you look closer, that dependency between the special sections and the module-specific KLP relocations doesn't actually exist in reality. Looking at the contents of the .altinstructions and .parainstructions sections, there's not a realistic scenario in which a KLP module's .altinstructions or .parainstructions section needs to access a symbol in a to-be-patched module. It might need to access a local symbol or even a vmlinux symbol; but not another module's symbol. When a special section needs to reference a local or vmlinux symbol, a normal rela can be used instead of a KLP rela. Since the special section initializations don't actually have any real dependency on module-specific KLP relocations, .klp.arch and arch_klp_init_object_loaded() no longer have a reason to exist. So remove them. As Peter said much more succinctly: So the reason for .klp.arch was that .klp.rela.* stuff would overwrite paravirt instructions. If that happens you're doing it wrong. Those RELAs are core kernel, not module, and thus should've happened in .rela.* sections at patch-module loading time. Reverting this removes the two apply_{paravirt,alternatives}() calls from the late patching path, and means we don't have to worry about them when removing module_disable_ro(). [ jpoimboe: Rewrote patch description. Tweaked klp_init_object_loaded() error path. 
] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c02791e5c75b..16632e75112a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -729,12 +729,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } -/* Arches may override this to finish any remaining arch-specific tasks */ -void __weak arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ -} - int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) { int i, ret; @@ -764,10 +758,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - if (klp_is_module(obj)) { + + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); + /* * Only write module-specific relocations here * (.klp.rela.{module}.*). vmlinux-specific relocations were @@ -775,17 +770,13 @@ static int klp_init_object_loaded(struct klp_patch *patch, * itself. */ ret = klp_apply_object_relocs(patch, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; - } - } - arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); + if (ret) + return ret; + } klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, -- cgit v1.2.3 From ca376a9374867d09ece6f61803764fb187201294 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:46 -0500 Subject: livepatch: Prevent module-specific KLP rela sections from referencing vmlinux symbols Prevent module-specific KLP rela sections from referencing vmlinux symbols. This helps prevent ordering issues with module special section initializations. Presumably such symbols are exported and normal relas can be used instead. 
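Once the section's object name and a symbol's object name have been parsed out, the new rule boils down to a single check; a simplified sketch of the idea (parameter names assumed, see the actual hunk below):

	static int klp_check_sym_object(const char *sec_objname,
					const char *sym_objname,
					const char *sym_name)
	{
		/* No ".klp.sym.vmlinux.*" symbols in ".klp.rela.<module>.*" sections. */
		if (strcmp(sec_objname, "vmlinux") && !strcmp(sym_objname, "vmlinux")) {
			pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
			       sym_name);
			return -EINVAL;
		}
		return 0;
	}
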
Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 16632e75112a..f9ebb54affab 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -192,17 +192,20 @@ static int klp_find_object_symbol(const char *objname, const char *name, } static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, - unsigned int symndx, Elf_Shdr *relasec) + unsigned int symndx, Elf_Shdr *relasec, + const char *sec_objname) { - int i, cnt, vmlinux, ret; - char objname[MODULE_NAME_LEN]; - char symname[KSYM_NAME_LEN]; + int i, cnt, ret; + char sym_objname[MODULE_NAME_LEN]; + char sym_name[KSYM_NAME_LEN]; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; + bool sym_vmlinux; + bool sec_vmlinux = !strcmp(sec_objname, "vmlinux"); /* - * Since the field widths for objname and symname in the sscanf() + * Since the field widths for sym_objname and sym_name in the sscanf() * call are hard-coded and correspond to MODULE_NAME_LEN and * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN * and KSYM_NAME_LEN have the values we expect them to have. @@ -223,20 +226,33 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, return -EINVAL; } - /* Format: .klp.sym.objname.symname,sympos */ + /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, ".klp.sym.%55[^.].%127[^,],%lu", - objname, symname, &sympos); + sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } + sym_vmlinux = !strcmp(sym_objname, "vmlinux"); + + /* + * Prevent module-specific KLP rela sections from referencing + * vmlinux symbols. This helps prevent ordering issues with + * module special section initializations. Presumably such + * symbols are exported and normal relas can be used instead. + */ + if (!sec_vmlinux && sym_vmlinux) { + pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section", + sym_name); + return -EINVAL; + } + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ - vmlinux = !strcmp(objname, "vmlinux"); - ret = klp_find_object_symbol(vmlinux ? NULL : objname, - symname, sympos, &addr); + ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname, + sym_name, sympos, &addr); if (ret) return ret; @@ -294,7 +310,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, if (strcmp(objname ? objname : "vmlinux", sec_objname)) return 0; - ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec); + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); if (ret) return ret; -- cgit v1.2.3 From d556e1be33320366272ec02f93f98d7f308479f1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:50 -0500 Subject: livepatch: Remove module_disable_ro() usage With arch_klp_init_object_loaded() gone, and apply_relocate_add() now using text_poke(), livepatch no longer needs to use module_disable_ro(). 
Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f9ebb54affab..6b8b3c067be0 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -777,7 +777,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, if (klp_is_module(obj)) { mutex_lock(&text_mutex); - module_disable_ro(patch->mod); /* * Only write module-specific relocations here @@ -787,7 +786,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, */ ret = klp_apply_object_relocs(patch, obj); - module_enable_ro(patch->mod, true); mutex_unlock(&text_mutex); if (ret) -- cgit v1.2.3 From 0d9fbf78fefb421a3af97394ce80bba0db4f046a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:51 -0500 Subject: module: Remove module_disable_ro() module_disable_ro() has no more users. Remove it. Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index fdd9f6970e9a..3ba024afe379 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,19 +1997,6 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -/* livepatching wants to disable read-only so it can frob module. */ -void module_disable_ro(const struct module *mod) -{ - if (!rodata_enabled) - return; - - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); -} - void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) -- cgit v1.2.3 From 5b384f933590a086ca9a0abdc2e55e41107ac440 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:52 -0500 Subject: x86/module: Use text_mutex in apply_relocate_add() Now that the livepatch code no longer needs the text_mutex for changing module permissions, move its usage down to apply_relocate_add(). Note the s390 version of apply_relocate_add() doesn't need to use the text_mutex because it already uses s390_kernel_write_lock, which accomplishes the same task. Signed-off-by: Josh Poimboeuf Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6b8b3c067be0..96d2da14eb0d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -775,9 +775,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, int ret; if (klp_is_module(obj)) { - - mutex_lock(&text_mutex); - /* * Only write module-specific relocations here * (.klp.rela.{module}.*). vmlinux-specific relocations were @@ -785,9 +782,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, * itself. 
*/ ret = klp_apply_object_relocs(patch, obj); - - mutex_unlock(&text_mutex); - if (ret) return ret; } -- cgit v1.2.3 From e6eff4376e2897c2e14b70d87bf7284cdb093830 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:53 -0500 Subject: module: Make module_enable_ro() static again Now that module_enable_ro() has no more external users, make it static again. Suggested-by: Jessica Yu Signed-off-by: Josh Poimboeuf Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3ba024afe379..a26343ea4d50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,7 +1997,7 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -void module_enable_ro(const struct module *mod, bool after_init) +static void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) return; @@ -2025,6 +2025,7 @@ static void module_enable_nx(const struct module *mod) #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } +static void module_enable_ro(const struct module *mod, bool after_init) {} #endif /* CONFIG_STRICT_MODULE_RWX */ static void module_enable_x(const struct module *mod) { -- cgit v1.2.3 From 6b0b0fa2bce61db790efc8070ae6e5696435b0a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 2 May 2020 11:24:25 -0700 Subject: crypto: lib/sha1 - rename "sha" to "sha1" The library implementation of the SHA-1 compression function is confusingly called just "sha_transform()". Alongside it are some "SHA_" constants and "sha_init()". Presumably these are left over from a time when SHA just meant SHA-1. But now there are also SHA-2 and SHA-3, and moreover SHA-1 is now considered insecure and thus shouldn't be used. Therefore, rename these functions and constants to make it very clear that they are for SHA-1. Also add a comment to make it clear that these shouldn't be used. For the extra-misleadingly named "SHA_MESSAGE_BYTES", rename it to SHA1_BLOCK_SIZE and define it to just '64' rather than '(512/8)' so that it matches the same definition in . This prepares for merging into . 
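After the rename, hashing a single already-padded 64-byte block with the library interface looks roughly like this (fragment for illustration only; it mirrors the bpf_prog_calc_tag() usage in the hunk below):

	u32 digest[SHA1_DIGEST_WORDS];
	u32 ws[SHA1_WORKSPACE_WORDS];
	u8 block[SHA1_BLOCK_SIZE];	/* one padded message block */

	sha1_init(digest);
	sha1_transform(digest, block, ws);
	/* digest[] now holds the updated SHA-1 state words */
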
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/bpf/core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..14aa1f74dd10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); -- cgit v1.2.3 From 52782c92ac85c4e393eb4a903a62e6c24afa633f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 6 May 2020 12:09:34 -0400 Subject: kthread: save thread function It's handy to keep the kthread_fn just as a unique cookie to identify classes of kthreads. E.g. if you can verify that a given task is running your thread_fn, then you may know what sort of type kthread_data points to. We'll use this in nfsd to pass some information into the vfs. Note it will need kthread_data() exported too. Original-patch-by: Tejun Heo Signed-off-by: J. Bruce Fields --- kernel/kthread.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..b84fc7eec035 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,6 +46,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; @@ -152,6 +153,20 @@ bool kthread_freezable_should_stop(bool *was_frozen) } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); +/** + * kthread_func - return the function specified on kthread creation + * @task: kthread task in question + * + * Returns NULL if the task is not a kthread. 
+ */ +void *kthread_func(struct task_struct *task) +{ + if (task->flags & PF_KTHREAD) + return to_kthread(task)->threadfn; + return NULL; +} +EXPORT_SYMBOL_GPL(kthread_func); + /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question @@ -164,6 +179,7 @@ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } +EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() @@ -244,6 +260,7 @@ static int kthread(void *_create) do_exit(-ENOMEM); } + self->threadfn = threadfn; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -- cgit v1.2.3 From f2a8d52e0a4db968c346c4332630a71cba377567 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:30 +0200 Subject: nsproxy: add struct nsset Add a simple struct nsset. It holds all necessary pieces to switch to a new set of namespaces without leaving a task in a half-switched state which we will make use of in the next patch. This patch switches the existing setns logic over without causing a change in setns() behavior. This brings setns() closer to how unshare() works(). The prepare_ns() function is responsible to prepare all necessary information. This has two reasons. First it minimizes dependencies between individual namespaces, i.e. all install handler can expect that all fields are properly initialized independent in what order they are called in. Second, this makes the code easier to maintain and easier to follow if it needs to be changed. The prepare_ns() helper will only be switched over to use a flags argument in the next patch. Here it will still use nstype as a simple integer argument which was argued would be clearer. I'm not particularly opinionated about this if it really helps or not. The struct nsset itself already contains the flags field since its name already indicates that it can contain information required by different namespaces. None of this should have functional consequences. Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. 
Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-2-christian.brauner@ubuntu.com --- kernel/cgroup/namespace.c | 5 +-- kernel/nsproxy.c | 90 ++++++++++++++++++++++++++++++++++++++++------- kernel/pid_namespace.c | 5 +-- kernel/time/namespace.c | 5 +-- kernel/user_namespace.c | 8 ++--- kernel/utsname.c | 5 +-- 6 files changed, 93 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..b7954fd60475 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -257,12 +258,79 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(int nstype, struct nsset *nsset) +{ + struct task_struct *me = current; + + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); + + if (nstype == CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) + goto out; + + if (nstype == CLONE_NEWNS) + nsset->fs = me->fs; + + nsset->flags = nstype; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} + +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. 
+ */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; struct file *file; struct ns_common *ns; + struct nsset nsset = {}; int err; file = proc_ns_fget(fd); @@ -274,20 +342,16 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ns->ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + err = prepare_nsset(ns->ops->type, &nsset); + if (err) goto out; - } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + err = ns->ops->install(&nsset, ns); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); } - switch_task_namespaces(tsk, new_nsproxy); - - perf_event_namespaces(tsk); + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..11db2bdbb41e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..5d9fc22d836a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +static int timens_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); int err; @@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; timens_set_vvar_page(current, ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if 
(!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); -- cgit v1.2.3 From ae24345da54e452880808b011fa2d8a0bbd191ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:58:59 -0700 Subject: bpf: Implement an interface to register bpf_iter targets The target can call bpf_iter_reg_target() to register itself. The needed information: target: target name seq_ops: the seq_file operations for the target init_seq_private target callback to initialize seq_priv during file open fini_seq_private target callback to clean up seq_priv during file release seq_priv_size: the private_data size needed by the seq_file operations The target name represents a target which provides a seq_ops for iterating objects. The target can provide two callback functions, init_seq_private and fini_seq_private, called during file open/release time. For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net name space needs to be setup properly during file open and released properly during file release. Function bpf_iter_unreg_target() is also implemented to unregister a particular target. 
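A target would then register itself along these lines (the "task" target, its seq_ops callbacks, and the private struct below are made-up names used only to illustrate the fields listed above):

	static const struct seq_operations task_iter_seq_ops = {
		.start	= task_seq_start,
		.next	= task_seq_next,
		.stop	= task_seq_stop,
		.show	= task_seq_show,
	};

	static int __init task_iter_init(void)
	{
		struct bpf_iter_reg reg_info = {
			.target			= "task",
			.seq_ops		= &task_iter_seq_ops,
			.init_seq_private	= task_iter_init_seq_priv,	/* may be NULL */
			.fini_seq_private	= task_iter_fini_seq_priv,	/* may be NULL */
			.seq_priv_size		= sizeof(struct task_iter_priv),
		};

		return bpf_iter_reg_target(&reg_info);
	}
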
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175859.2474669-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..6a8b0febd3f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..5a8119d17d14 --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include + +struct bpf_iter_target_info { + struct list_head list; + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->target = reg_info->target; + tinfo->seq_ops = reg_info->seq_ops; + tinfo->init_seq_private = reg_info->init_seq_private; + tinfo->fini_seq_private = reg_info->fini_seq_private; + tinfo->seq_priv_size = reg_info->seq_priv_size; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const char *target) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (!strcmp(target, tinfo->target)) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or teminating the iterator. 
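For illustration only, a minimal bpf_iter program for the bpf_map target could look like the sketch below. It assumes libbpf's "iter/<target>" section naming (which makes the loader set expected_attach_type to BPF_TRACE_ITER and resolve attach_btf_id to __bpf_iter__bpf_map) and a BTF-generated vmlinux.h:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
        /* map is NULL on the final call from seq_ops->stop(), i.e. once
         * all objects have been visited.
         */
        if (!ctx->map)
                return 0;

        /* 0: object handled; 1: request the same object again */
        return 0;
}

char _license[] SEC("license") = "GPL";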
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. 
- create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefit to use of bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and free based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so there is no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++++ 2 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER 
&& + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 2057c92bc927f09b22f5609425eb37d7e782f484 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:02 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_UPDATE Added BPF_LINK_UPDATE support for tracing/iter programs. This way, a file based bpf iterator, which holds a reference to the link, can have its bpf program updated without creating new files. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175902.2475262-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 03f5832909db..0542a243b78c 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -23,6 +23,9 @@ struct bpf_iter_link { static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); +/* protect bpf_iter_link changes */ +static DEFINE_MUTEX(link_mutex); + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -111,9 +114,37 @@ static void bpf_iter_link_dealloc(struct bpf_link *link) kfree(iter_link); } +static int bpf_iter_link_replace(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + int ret = 0; + + mutex_lock(&link_mutex); + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + + if (link->prog->type != new_prog->type || + link->prog->expected_attach_type != new_prog->expected_attach_type || + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { + ret = -EINVAL; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&link_mutex); + return ret; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, + .update_prog = bpf_iter_link_replace, }; int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) -- cgit v1.2.3 From fd4f12bc38c3ad9107169e7c9e6e7f81d93dda97 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:04 -0700 Subject: bpf: Implement bpf_seq_read() for bpf iterator bpf iterator uses seq_file to provide a lossless way to transfer data to user space. But we want to call bpf program after all objects have been traversed, and bpf program may write additional data to the seq_file buffer. The current seq_read() does not work for this use case. Besides allowing stop() function to write to the buffer, the bpf_seq_read() also fixed the buffer size to one page. If any single call of show() or stop() will emit data more than one page to cause overflow, -E2BIG error code will be returned to user space. 
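From user space the resulting file is consumed like any other seq_file, which is where the semantics above become visible. A rough sketch (the fd is obtained via the BPF_ITER_CREATE command added later in this series, or by opening a pinned iterator):

#include <stdio.h>
#include <unistd.h>
#include <errno.h>

static int drain_iter_fd(int iter_fd)
{
        char buf[4096];         /* kernel-side buffer is one page */
        ssize_t n;

        while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);

        if (n < 0)              /* e.g. errno == E2BIG if a single object
                                 * overflowed the one-page buffer
                                 */
                return -errno;

        return 0;               /* EOF: stop() ran, possibly calling the prog */
}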
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175904.2475468-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0542a243b78c..832973ee80fa 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -26,6 +26,129 @@ static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* bpf_seq_read, a customized and simpler version for bpf iterator. + * no_llseek is assumed for this file. + * The following are differences from seq_read(): + * . fixed buffer size (PAGE_SIZE) + * . assuming no_llseek + * . stop() may call bpf program, handling potential overflow there + */ +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + size_t n, offs, copied = 0; + int err = 0; + void *p; + + mutex_lock(&seq->lock); + + if (!seq->buf) { + seq->size = PAGE_SIZE; + seq->buf = kmalloc(seq->size, GFP_KERNEL); + if (!seq->buf) { + err = -ENOMEM; + goto done; + } + } + + if (seq->count) { + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf + seq->from, n); + if (err) { + err = -EFAULT; + goto done; + } + seq->count -= n; + seq->from += n; + copied = n; + goto done; + } + + seq->from = 0; + p = seq->op->start(seq, &seq->index); + if (!p) + goto stop; + if (IS_ERR(p)) { + err = PTR_ERR(p); + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = 0; + } else if (err < 0 || seq_has_overflowed(seq)) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + while (1) { + loff_t pos = seq->index; + + offs = seq->count; + p = seq->op->next(seq, p, &seq->index); + if (pos == seq->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + seq->op->next); + seq->index++; + } + + if (IS_ERR_OR_NULL(p)) + break; + + if (seq->count >= size) + break; + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = offs; + } else if (err < 0 || seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + goto done; + } + break; + } + } +stop: + offs = seq->count; + /* bpf program called if !p */ + seq->op->stop(seq, p); + if (!p && seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } + } + + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf, n); + if (err) { + err = -EFAULT; + goto done; + } + copied = n; + seq->count -= n; + seq->from = n; +done: + if (!copied) + copied = err; + else + *ppos += copied; + mutex_unlock(&seq->lock); + return copied; +} + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. 
The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: an unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registeration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 ++++++++++ 2 files changed, 155 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. 
* The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); 
+ + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 367ec3e4834cbd611401c2c40a23c22c825474f1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:06 -0700 Subject: bpf: Create file bpf iterator To produce a file bpf iterator, the fd must be corresponding to a link_fd assocciated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175906.2475893-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 17 ++++++++++++++++- kernel/bpf/inode.c | 5 ++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index e7129b57865f..090f09b0eacb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -39,6 +39,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -162,6 +164,13 @@ done: return copied; } +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; @@ -183,7 +192,8 @@ static int iter_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations bpf_iter_fops = { +const struct file_operations bpf_iter_fops = { + .open = iter_open, .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, @@ -310,6 +320,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = { .update_prog = bpf_iter_link_replace, }; +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * -- cgit v1.2.3 From e5158d987b72c3f318b4b52a01ac6f3997bd0c00 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:07 -0700 Subject: bpf: Implement common macros/helpers for target iterators Macro DEFINE_BPF_ITER_FUNC is implemented so target can define an init function to capture the BTF type which represents the target. The bpf_iter_meta is a structure holding meta data, common to all targets in the bpf program. 
Additional marker functions are called before or after bpf_seq_read() show()/next()/stop() callback functions to help calculate precise seq_num and whether call bpf_prog inside stop(). Two functions, bpf_iter_get_info() and bpf_iter_run_prog(), are implemented so target can get needed information from bpf_iter infrastructure and can run the program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175907.2475956-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 090f09b0eacb..30efd15cd4a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -41,6 +41,33 @@ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -93,6 +120,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, err = seq->op->show(seq, p); if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. 
+ */ + bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) @@ -117,11 +148,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (IS_ERR_OR_NULL(p)) break; + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + if (seq->count >= size) break; err = seq->op->show(seq, p); if (err > 0) { + bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; @@ -138,11 +173,15 @@ stop: offs = seq->count; /* bpf program called if !p */ seq->op->stop(seq, p); - if (!p && seq_has_overflowed(seq)) { - seq->count = offs; - if (offs == 0) { - err = -E2BIG; - goto done; + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } } } @@ -453,3 +492,39 @@ free_fd: put_unused_fd(fd); return err; } + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + return ret == 0 ? 0 : -EAGAIN; +} -- cgit v1.2.3 From 6086d29def80edd78f9832ea6eafa74e3818f6a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:09 -0700 Subject: bpf: Add bpf_map iterator Implement seq_file operations to traverse all bpf_maps. 
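Putting the pieces of this series together, an anonymous bpf_map iterator can be driven from user space roughly as follows. This is a sketch only: it assumes UAPI headers that already carry BPF_TRACE_ITER, BPF_ITER_CREATE and the link_create/iter_create attr fields added alongside these patches, and prog_fd refers to an already loaded tracing/iter program for the bpf_map target:

#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return syscall(__NR_bpf, cmd, attr, size);
}

static int open_bpf_map_iter(int prog_fd)
{
        union bpf_attr attr;
        int link_fd, iter_fd;

        memset(&attr, 0, sizeof(attr));
        attr.link_create.prog_fd = prog_fd;
        attr.link_create.attach_type = BPF_TRACE_ITER;
        /* target_fd and flags must stay 0 for iter links */
        link_fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
        if (link_fd < 0)
                return link_fd;

        memset(&attr, 0, sizeof(attr));
        attr.iter_create.link_fd = link_fd;
        iter_fd = sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));

        /* the iterator file took its own reference on the prog */
        close(link_fd);

        return iter_fd;         /* read() this fd to run the program */
}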
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175909.2476096-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/map_iter.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 ++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/map_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a8b0febd3f6..b2b5eefc5254 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..8162e0c00b9f --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +struct bpf_iter_seq_map_info { + u32 mid; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + ++*pos; + ++info->mid; + bpf_map_put((struct bpf_map *)v); + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + return map; +} + +struct bpf_iter__bpf_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +static int __init bpf_map_iter_init(void) +{ + struct bpf_iter_reg reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + }; + + return bpf_iter_reg_target(®_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a293e88ee01a..de2a75500233 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map 
*bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From eaaacd23910f2d7c4b22d43f591002cc217d294b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:11 -0700 Subject: bpf: Add task and task/file iterator targets Only the tasks belonging to "current" pid namespace are enumerated. For task/file target, the bpf program will have access to struct task_struct *task u32 fd struct file *file where fd/file is an open file for the task. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175911.2476407-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/task_iter.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/task_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index b2b5eefc5254..37b2d8620153 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c new file mode 100644 index 000000000000..aeed662d8451 --- /dev/null +++ b/kernel/bpf/task_iter.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include +#include +#include + +struct bpf_iter_seq_task_common { + struct pid_namespace *ns; +}; + +struct bpf_iter_seq_task_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. 
+ */ + struct bpf_iter_seq_task_common common; + u32 tid; +}; + +static struct task_struct *task_seq_get_next(struct pid_namespace *ns, + u32 *tid) +{ + struct task_struct *task = NULL; + struct pid *pid; + + rcu_read_lock(); + pid = idr_get_next(&ns->idr, tid); + if (pid) + task = get_pid_task(pid, PIDTYPE_PID); + rcu_read_unlock(); + + return task; +} + +static void *task_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + ++*pos; + return task; +} + +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + ++*pos; + ++info->tid; + put_task_struct((struct task_struct *)v); + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + return task; +} + +struct bpf_iter__task { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) + +static int __task_seq_show(struct seq_file *seq, struct task_struct *task, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__task ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.task = task; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_seq_show(struct seq_file *seq, void *v) +{ + return __task_seq_show(seq, v, false); +} + +static void task_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__task_seq_show(seq, v, true); + else + put_task_struct((struct task_struct *)v); +} + +static const struct seq_operations task_seq_ops = { + .start = task_seq_start, + .next = task_seq_next, + .stop = task_seq_stop, + .show = task_seq_show, +}; + +struct bpf_iter_seq_task_file_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct files_struct *files; + u32 tid; + u32 fd; +}; + +static struct file * +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, + struct task_struct **task, struct files_struct **fstruct) +{ + struct pid_namespace *ns = info->common.ns; + u32 curr_tid = info->tid, max_fds; + struct files_struct *curr_files; + struct task_struct *curr_task; + int curr_fd = info->fd; + + /* If this function returns a non-NULL file object, + * it held a reference to the task/files_struct/file. + * Otherwise, it does not hold any reference. 
+ */ +again: + if (*task) { + curr_task = *task; + curr_files = *fstruct; + curr_fd = info->fd; + } else { + curr_task = task_seq_get_next(ns, &curr_tid); + if (!curr_task) + return NULL; + + curr_files = get_files_struct(curr_task); + if (!curr_files) { + put_task_struct(curr_task); + curr_tid = ++(info->tid); + info->fd = 0; + goto again; + } + + /* set *fstruct, *task and info->tid */ + *fstruct = curr_files; + *task = curr_task; + if (curr_tid == info->tid) { + curr_fd = info->fd; + } else { + info->tid = curr_tid; + curr_fd = 0; + } + } + + rcu_read_lock(); + max_fds = files_fdtable(curr_files)->max_fds; + for (; curr_fd < max_fds; curr_fd++) { + struct file *f; + + f = fcheck_files(curr_files, curr_fd); + if (!f) + continue; + + /* set info->fd */ + info->fd = curr_fd; + get_file(f); + rcu_read_unlock(); + return f; + } + + /* the current task is done, go to the next task */ + rcu_read_unlock(); + put_files_struct(curr_files); + put_task_struct(curr_task); + *task = NULL; + *fstruct = NULL; + info->fd = 0; + curr_tid = ++(info->tid); + goto again; +} + +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = NULL; + struct task_struct *task = NULL; + struct file *file; + + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + ++*pos; + info->task = task; + info->files = files; + + return file; +} + +static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = info->files; + struct task_struct *task = info->task; + struct file *file; + + ++*pos; + ++info->fd; + fput((struct file *)v); + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + info->task = task; + info->files = files; + + return file; +} + +struct bpf_iter__task_file { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + u32 fd __aligned(8); + __bpf_md_ptr(struct file *, file); +}; + +DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, + struct task_struct *task, u32 fd, + struct file *file) + +static int __task_file_seq_show(struct seq_file *seq, struct file *file, + bool in_stop) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct bpf_iter__task_file ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.fd = info->fd; + ctx.file = file; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_file_seq_show(struct seq_file *seq, void *v) +{ + return __task_file_seq_show(seq, v, false); +} + +static void task_file_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + + if (!v) { + (void)__task_file_seq_show(seq, v, true); + } else { + fput((struct file *)v); + put_files_struct(info->files); + put_task_struct(info->task); + info->files = NULL; + info->task = NULL; + } +} + +static int init_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + common->ns = get_pid_ns(task_active_pid_ns(current)); + return 0; +} + +static void fini_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + put_pid_ns(common->ns); +} + +static const 
struct seq_operations task_file_seq_ops = { + .start = task_file_seq_start, + .next = task_file_seq_next, + .stop = task_file_seq_stop, + .show = task_file_seq_show, +}; + +static int __init task_iter_init(void) +{ + struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + }; + struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + }; + int ret; + + ret = bpf_iter_reg_target(&task_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&task_file_reg_info); +} +late_initcall(task_iter_init); -- cgit v1.2.3 From b121b341e5983bdccf7a5d6cf9236a45c965a31f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:12 -0700 Subject: bpf: Add PTR_TO_BTF_ID_OR_NULL support Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support. For tracing/iter program, the bpf program context definition, e.g., for previous bpf_map target, looks like struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; }; The kernel guarantees that meta is not NULL, but map pointer maybe NULL. The NULL map indicates that all objects have been traversed, so bpf program can take proper action, e.g., do final aggregation and/or send final report to user space. Add btf_id_or_null_non0_off to prog->aux structure, to indicate that if the context access offset is not 0, set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID. This bit is set for tracing/iter program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175912.2476576-1-yhs@fb.com --- kernel/bpf/btf.c | 5 ++++- kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2cfba89a8e1..c490fbde22d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; + if (off != 0 && prog->aux->btf_id_or_null_non0_off) + info->reg_type = PTR_TO_BTF_ID_OR_NULL; + else + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d725ff7d11db..36b2a38a06fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -483,6 +484,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", }; static char slot_type_char[] = { @@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID) + if (t == 
PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return true; default: return false; @@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; + prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending how collection traversal is done, even if the object still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on no-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). 
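A sketch of how a tracing/iter program might use the two helpers, again illustrative only: it assumes vmlinux.h plus libbpf's generated helper prototypes and "iter/<target>" section naming, and reads a couple of struct bpf_map fields directly through the BTF pointer:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("iter/bpf_map")
int dump_bpf_map_ids(struct bpf_iter__bpf_map *ctx)
{
        struct seq_file *seq = ctx->meta->seq;
        struct bpf_map *map = ctx->map;
        static const char hdr[] = "  id  max_entries\n";
        static const char fmt[] = "%4u  %u\n";
        __u64 args[2];

        if (!map)               /* final stop() call: all maps shown */
                return 0;

        if (ctx->meta->seq_num == 0)
                bpf_seq_write(seq, hdr, sizeof(hdr) - 1);

        args[0] = map->id;
        args[1] = map->max_entries;
        /* -EOVERFLOW from the helper just means this object will be
         * retried with a fresh buffer on the next read()
         */
        bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));

        return 0;
}

char _license[] SEC("license") = "GPL";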
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 
4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From 1d68f22b3d53d368d5cc8d09de890250cae5c945 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:15 -0700 Subject: bpf: Handle spilled PTR_TO_BTF_ID properly when checking stack_boundary This specifically to handle the case like below: // ptr below is a socket ptr identified by PTR_TO_BTF_ID u64 param[2] = { ptr, val }; bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param)); In this case, the 16 bytes stack for "param" contains: 8 bytes for ptr with spilled PTR_TO_BTF_ID 8 bytes for val as STACK_MISC The current verifier will complain the ptr should not be visible to the helper. ... 
16: (7b) *(u64 *)(r10 -64) = r2 18: (7b) *(u64 *)(r10 -56) = r1 19: (bf) r4 = r10 ; 20: (07) r4 += -64 ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol); 21: (bf) r1 = r6 22: (18) r2 = 0xffffa8d00018605a 24: (b4) w3 = 10 25: (b4) w5 = 16 26: (85) call bpf_seq_printf#125 R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0) R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10 R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0) R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm fp-64_w=ptr_ last_idx 26 first_idx 13 regs=8 stack=0 before 25: (b4) w5 = 16 regs=8 stack=0 before 24: (b4) w3 = 10 invalid indirect read from stack off -64+0 size 16 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175915.2476783-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36b2a38a06fe..2a1826c76bb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3494,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + goto mark; + if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); -- cgit v1.2.3 From 9c5f8a1008a121e4c6b24af211034e24b0b63081 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:16 -0700 Subject: bpf: Support variable length array in tracing programs In /proc/net/ipv6_route, we have struct fib6_info { struct fib6_table *fib6_table; ... struct fib6_nh fib6_nh[0]; } struct fib6_nh { struct fib_nh_common nh_common; struct rt6_info **rt6i_pcpu; struct rt6_exception_bucket *rt6i_exception_bucket; }; struct fib_nh_common { ... u8 nhc_gw_family; ... } The access: struct fib6_nh *fib6_nh = &rt->fib6_nh; ... fib6_nh->nh_common.nhc_gw_family ... This patch ensures such an access is handled properly. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175916.2476853-1-yhs@fb.com --- kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c490fbde22d4..dcd233139294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3833,6 +3833,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; + u32 vlen; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3841,7 +3842,43 @@ again: return -EINVAL; } + vlen = btf_type_vlen(t); if (off + size > t->size) { + /* If the last element is a variable size array, we may + * need to relax the rule. + */ + struct btf_array *array_elem; + + if (vlen == 0) + goto error; + + member = btf_type_member(t) + vlen - 1; + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + NULL); + if (!btf_type_is_array(mtype)) + goto error; + + array_elem = (struct btf_array *)(mtype + 1); + if (array_elem->nelems != 0) + goto error; + + moff = btf_member_bit_offset(t, member) / 8; + if (off < moff) + goto error; + + /* Only allow structure for now, can be relaxed for + * other types later. 
+ */ + elem_type = btf_type_skip_modifiers(btf_vmlinux, + array_elem->type, NULL); + if (!btf_type_is_struct(elem_type)) + goto error; + + off = (off - moff) % elem_type->size; + return btf_struct_access(log, elem_type, off, size, atype, + next_btf_id); + +error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; -- cgit v1.2.3 From b6522fa409cfafbc3968679b09e4028f0609f2b9 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Sat, 11 Apr 2020 21:06:19 +0800 Subject: parisc: add sysctl file interface panic_on_stackoverflow The variable sysctl_panic_on_stackoverflow is used in arch/parisc/kernel/irq.c and arch/x86/kernel/irq_32.c, but the sysctl file interface panic_on_stackoverflow only exists on x86. Add sysctl file interface panic_on_stackoverflow for parisc Signed-off-by: Xiaoming Ni Reviewed-by: Luis Chamberlain Signed-off-by: Helge Deller --- kernel/sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..b9ff323e1d26 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -994,30 +994,32 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) + +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#if defined(CONFIG_X86) { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3 From a4ae16f65c335f8be58b67b78628c788c4b325a5 Mon Sep 17 00:00:00 2001 From: Samuel Zou Date: Sat, 9 May 2020 09:16:41 +0800 Subject: livepatch: Make klp_apply_object_relocs static Fix the following sparse warning: kernel/livepatch/core.c:748:5: warning: symbol 'klp_apply_object_relocs' was not declared. The klp_apply_object_relocs() has only one call site within core.c; it should be static Fixes: 7c8e2bdd5f0d ("livepatch: Apply vmlinux-specific KLP relocations early") Reported-by: Hulk Robot Signed-off-by: Samuel Zou Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 96d2da14eb0d..f76fdb925532 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -745,7 +745,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? 
func->old_sympos : 1); } -int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) +static int klp_apply_object_relocs(struct klp_patch *patch, + struct klp_object *obj) { int i, ret; struct klp_modinfo *info = patch->mod->klp_info; -- cgit v1.2.3 From b92b36eadf4d7fa4a34f048c2a3bb61a735a885e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 May 2020 18:07:40 +0300 Subject: workqueue: Fix an use after free in init_rescuer() We need to preserve error code before freeing "rescuer". Fixes: f187b6974f6df ("workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO.") Signed-off-by: Dan Carpenter Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddf0537dce14..10ed8d761e0b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,6 +4197,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,8 +4209,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) { + ret = PTR_ERR(rescuer->task); kfree(rescuer); - return PTR_ERR(rescuer->task); + return ret; } wq->rescuer = rescuer; -- cgit v1.2.3 From 385bbf7b119a4feb6d6bcf3586f1bb1dd9c5b0a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:50:57 -0500 Subject: bpf, libbpf: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200507185057.GA13981@embeddedor --- kernel/bpf/queue_stack_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..30e1373fd437 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) -- cgit v1.2.3 From 90b5363acd4739769c3f38c1aff16171bd133e8c Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 27 Mar 2020 11:44:56 +0100 Subject: sched: Clean up scheduler_ipi() The scheduler IPI has grown weird and wonderful over the years, time for spring cleaning. Move all the non-trivial stuff out of it and into a regular smp function call IPI. This then reduces the schedule_ipi() to most of it's former NOP glory and ensures to keep the interrupt vector lean and mean. Aside of that avoiding the full irq_enter() in the x86 IPI implementation is incorrect as scheduler_ipi() can be instrumented. To work around that scheduler_ipi() had an irq_enter/exit() hack when heavy work was pending. This is gone now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134058.361859938@linutronix.de --- kernel/sched/core.c | 64 ++++++++++++++++++++++++---------------------------- kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 6 +++-- 3 files changed, 36 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b58efb1156eb..cd2070d6f1e4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -219,6 +219,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +321,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -650,6 +655,16 @@ static inline bool got_nohz_idle_kick(void) return false; } +static void nohz_csd_func(void *info) +{ + struct rq *rq = info; + + if (got_nohz_idle_kick()) { + rq->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } +} + #else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) @@ -2292,6 +2307,11 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } +static void wake_csd_func(void *info) +{ + sched_ttwu_pending(); +} + void scheduler_ipi(void) { /* @@ -2300,34 +2320,6 @@ void scheduler_ipi(void) * this IPI. */ preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. 
Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) @@ -2336,9 +2328,9 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); + smp_call_function_single_async(cpu, &rq->wake_csd); else trace_sched_wake_idle_without_ipi(cpu); } @@ -6693,12 +6685,16 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + rq_csd_init(rq, &rq->wake_csd, wake_csd_func); + INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 46b7bd41573f..6b7f1474e2d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10000,12 +10000,11 @@ static void kick_ilb(unsigned int flags) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 978c6fac8cb8..21416b30c520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -889,9 +889,10 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ unsigned long nr_load_updates; @@ -978,7 +979,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1020,6 +1021,7 @@ struct rq { #endif #ifdef CONFIG_SMP + call_single_data_t wake_csd; struct llist_head wake_list; #endif -- cgit v1.2.3 From 2a0a24ebb499c9d499eea948d3fc108f936e36d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Mar 2020 12:42:00 +0100 Subject: sched: Make scheduler_ipi inline Now that the scheduler IPI is trivial and simple again there is no point to have the little function out of line. This simplifies the effort of constraining the instrumentation nicely. 
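The header-side counterpart is outside the (limited to 'kernel') diff shown here; a minimal sketch of the resulting inline helper, reconstructed from the body removed from kernel/sched/core.c (the exact annotations used in the header may differ):

  static __always_inline void scheduler_ipi(void)
  {
  	/*
  	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
  	 * TIF_NEED_RESCHED remotely (for the first time) will also send
  	 * this IPI.
  	 */
  	preempt_fold_need_resched();
  }
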
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134058.453581595@linutronix.de --- kernel/sched/core.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd2070d6f1e4..74fb89b5ce3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,16 +2312,6 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} - static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); -- cgit v1.2.3 From 4fdd88877e5259dcc5e504dca449acc0324d71b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:49:36 +0900 Subject: kprobes: Lock kprobe_mutex while showing kprobe_blacklist Lock kprobe_mutex while showing kprobe_blacklist to prevent updating the kprobe_blacklist. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.571125195@linutronix.de --- kernel/kprobes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..570d60827656 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2420,6 +2420,7 @@ static const struct file_operations debugfs_kprobes_operations = { /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2446,10 +2447,15 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) +{ + mutex_unlock(&kprobe_mutex); +} + static const struct seq_operations kprobe_blacklist_seq_ops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ + .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; -- cgit v1.2.3 From 1e6769b0aece51ea7a3dc3117c37d4a5669e4a21 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:49:48 +0900 Subject: kprobes: Support __kprobes blacklist in modules Support __kprobes attribute for blacklist functions in modules. The __kprobes attribute functions are stored in .kprobes.text section. 
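For reference, a minimal module-side sketch (function and identifier names are hypothetical) of what ends up in the area handled by add_module_kprobe_blacklist():

  #include <linux/kprobes.h>
  #include <linux/module.h>

  /* __kprobes places the function in the module's .kprobes.text section;
   * with this patch the whole section is added to the kprobe blacklist at
   * MODULE_STATE_COMING and removed again at MODULE_STATE_GOING. */
  static int __kprobes demo_unprobeable(void *data)
  {
  	return 0;
  }
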
Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.678201813@linutronix.de --- kernel/kprobes.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/module.c | 4 ++++ 2 files changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 570d60827656..b7549992b9bd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,19 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2215,6 +2228,28 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2260,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2295,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..978f3fa40850 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->ei_funcs = section_objs(info, "_error_injection_whitelist", sizeof(*mod->ei_funcs), &mod->num_ei_funcs); +#endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 16db6264c93d2d7df9eb8be5d9eb717ab30105fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:50:00 +0900 Subject: kprobes: Support NOKPROBE_SYMBOL() in modules Support NOKPROBE_SYMBOL() in modules. NOKPROBE_SYMBOL() records only symbol address in "_kprobe_blacklist" section in the module. 
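A companion sketch for this variant (again with hypothetical names): NOKPROBE_SYMBOL() records only the symbol's address, which add_module_kprobe_blacklist() now feeds to kprobe_add_ksym_blacklist() when the module comes up:

  #include <linux/kprobes.h>
  #include <linux/module.h>

  static int demo_helper(struct pt_regs *regs)
  {
  	return 0;
  }
  /* Stores &demo_helper in the module's _kprobe_blacklist section. */
  NOKPROBE_SYMBOL(demo_helper);
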
Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.771170126@linutronix.de --- kernel/kprobes.c | 17 +++++++++++++++++ kernel/module.c | 3 +++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7549992b9bd..9eb5acf0a9f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2192,6 +2192,11 @@ static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) } } +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2231,6 +2236,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { @@ -2242,6 +2253,12 @@ static void add_module_kprobe_blacklist(struct module *mod) static void remove_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { diff --git a/kernel/module.c b/kernel/module.c index 978f3fa40850..faf733789560 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3197,6 +3197,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_KPROBES mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 303cc571d107b3641d6487061b748e70ffe15ce4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:31 +0200 Subject: nsproxy: attach to namespaces via pidfds For quite a while we have been thinking about using pidfds to attach to namespaces. This patchset has existed for about a year already but we've wanted to wait to see how the general api would be received and adopted. Now that more and more programs in userspace have started using pidfds for process management it's time to send this one out. This patch makes it possible to use pidfds to attach to the namespaces of another process, i.e. they can be passed as the first argument to the setns() syscall. When only a single namespace type is specified the semantics are equivalent to passing an nsfd. That means setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However, when a pidfd is passed, multiple namespace flags can be specified in the second setns() argument and setns() will attach the caller to all the specified namespaces all at once or to none of them. Specifying 0 is not valid together with a pidfd. Here are just two obvious examples: setns(pidfd, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET); setns(pidfd, CLONE_NEWUSER); Allowing to also attach subsets of namespaces supports various use-cases where callers setns to a subset of namespaces to retain privilege, perform an action and then re-attach another subset of namespaces. 
If the need arises, as Eric suggested, we can extend this patchset to assume even more context than just attaching all namespaces. His suggestion specifically was about assuming the process' root directory when setns(pidfd, 0) or setns(pidfd, SETNS_PIDFD) is specified. For now, just keep it flexible in terms of supporting subsets of namespaces but let's wait until we have users asking for even more context to be assumed. At that point we can add an extension. The obvious example where this is useful is a standard container manager interacting with a running container: pushing and pulling files or directories, injecting mounts, attaching/execing any kind of process, managing network devices. All these operations require attaching to all or at least multiple namespaces at the same time. Given that nowadays most containers are spawned with all namespaces enabled, we're currently looking at at least 14 syscalls, 7 to open the /proc//ns/ nsfds, another 7 to actually perform the namespace switch. With time namespaces we're looking at about 16 syscalls. (We could amortize the first 7 or 8 syscalls for opening the nsfds by stashing them in each container's monitor process but that would mean we need to send around those file descriptors through unix sockets every time we want to interact with the container or keep on-disk state. Even in scenarios where a caller wants to join a particular namespace in a particular order, callers still profit from batching other namespaces. That mostly applies to the user namespace but all container runtimes I found join the user namespace first no matter if it privileges or deprivileges the container, similar to how unshare behaves.) With pidfds this becomes a single syscall no matter how many namespaces are supposed to be attached to. A decently designed, large-scale container manager usually isn't the parent of any of the containers it spawns, so the containers don't die when it crashes or needs to update or reinitialize. This means that for the manager to interact with containers through pids is inherently racy, especially on systems where the maximum pid number is not significantly bumped. This is even more problematic since we often spawn and manage thousands or tens of thousands of containers. Interacting with a container through a pid thus can become risky quite quickly, especially since we allow an administrator to enable advanced features such as syscall interception where we're performing syscalls in lieu of the container. In all of those cases we use pidfds if they are available and we pass them around as stable references. Using them to setns() to the target process' namespaces is as reliable as using nsfds. Either the target process is already dead and we get ESRCH or we manage to attach to its namespaces but we can't accidentally attach to another process' namespaces. So pidfds lend themselves to be used with this api. The other main advantage is that with this change the pidfd becomes the only relevant token for most container interactions and it's the only token we need to create and send around. Apart from significantly reducing the number of syscalls from double digits to single digits, which is a decent reason post-spectre/meltdown, this also allows switching to a set of namespaces atomically, i.e. either attaching to all the specified namespaces succeeds or we fail. If we fail we haven't changed a single namespace.
There are currently three namespaces that can fail (other than for ENOMEM which really is not very interesting since we then have other problems anyway) for non-trivial reasons, user, mount, and pid namespaces. We can fail to attach to a pid namespace if it is not our current active pid namespace or a descendant of it. We can fail to attach to a user namespace because we are multi-threaded or because our current mount namespace shares filesystem state with other tasks, or because we're trying to setns() to the same user namespace, i.e. the target task has the same user namespace as we do. We can fail to attach to a mount namespace because it shares filesystem state with other tasks or because we fail to lookup the new root for the new mount namespace. In most non-pathological scenarios these issues can be somewhat mitigated. But there are cases where we're half-attached to some namespace and failing to attach to another one. I've talked about some of these problem during the hallway track (something only the pre-COVID-19 generation will remember) of Plumbers in Los Angeles in 2018(?). Even if all these issues could be avoided with super careful userspace coding it would be nicer to have this done in-kernel. Pidfds seem to lend themselves nicely for this. The other neat thing about this is that setns() becomes an actual counterpart to the namespace bits of unshare(). Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-3-christian.brauner@ubuntu.com --- kernel/nsproxy.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 213 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b7954fd60475..b03df67621d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -258,17 +259,58 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static int check_setns_flags(unsigned long flags) +{ + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | + CLONE_NEWCGROUP))) + return -EINVAL; + +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif + + return 0; +} + static void put_nsset(struct nsset *nsset) { unsigned flags = nsset->flags; if (flags & CLONE_NEWUSER) put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. 
+ */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); if (nsset->nsproxy) free_nsproxy(nsset->nsproxy); } -static int prepare_nsset(int nstype, struct nsset *nsset) +static int prepare_nsset(unsigned flags, struct nsset *nsset) { struct task_struct *me = current; @@ -276,17 +318,23 @@ static int prepare_nsset(int nstype, struct nsset *nsset) if (IS_ERR(nsset->nsproxy)) return PTR_ERR(nsset->nsproxy); - if (nstype == CLONE_NEWUSER) + if (flags & CLONE_NEWUSER) nsset->cred = prepare_creds(); else nsset->cred = current_cred(); if (!nsset->cred) goto out; - if (nstype == CLONE_NEWNS) + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; + } - nsset->flags = nstype; + nsset->flags = flags; return 0; out: @@ -294,6 +342,138 @@ out: return -ENOMEM; } +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. 
+ */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + /* * This is the point of no return. There are just a few namespaces * that do some actual work here and it's sufficiently minimal that @@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset) } #endif + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); + } + #ifdef CONFIG_IPC_NS if (flags & CLONE_NEWIPC) exit_sem(me); @@ -326,27 +512,38 @@ static void commit_nsset(struct nsset *nsset) nsset->nsproxy = NULL; } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +SYSCALL_DEFINE2(setns, int, fd, int, flags) { struct file *file; - struct ns_common *ns; + struct ns_common *ns = NULL; struct nsset nsset = {}; - int err; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); + int err = 0; - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EBADF; + } + if (err) goto out; - err = prepare_nsset(ns->ops->type, &nsset); + err = prepare_nsset(flags, &nsset); if (err) goto out; - err = ns->ops->install(&nsset, ns); + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); -- cgit v1.2.3 From c9d64a1b2d0b9c81aef5e907bbeff549fd844d13 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:33 -0700 Subject: rcuwait: Fix stale wake call name in comment The 'trywake' name was renamed to simply 'wake', update the comment. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-2-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 389a88cb3081..9f9015f3f6b0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -236,7 +236,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. 
Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE -- cgit v1.2.3 From 9d9a6ebfea329cadeda87544dceae01e9c27aaef Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:34 -0700 Subject: rcuwait: Let rcuwait_wake_up() return whether or not a task was awoken Propagating the return value of wake_up_process() back to the caller can come in handy for future users, such as for statistics or accounting purposes. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-3-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9f9015f3f6b0..f3beb637acf7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -227,8 +227,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -248,8 +249,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); -- cgit v1.2.3 From 2e3ed68bfcd9c5ca2cf8b88ba23a34992ccd0b1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:18 -0700 Subject: bpf: Add comments to interpret bpf_prog return values Add a short comment in bpf_iter_run_prog() function to explain how bpf_prog return value is converted to seq_ops->show() return value: bpf_prog return seq_ops()->show() return 0 0 1 -EAGAIN When show() return value is -EAGAIN, the current bpf_seq_read() will end. If the current seq_file buffer is empty, -EAGAIN will return to user space. Otherwise, the buffer will be copied to user space. In both cases, the next bpf_seq_read() call will try to show the same object which returned -EAGAIN previously. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180218.2949517-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 30efd15cd4a0..0a45a6cdfabd 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -526,5 +526,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock(); + /* bpf program can only return 0 or 1: + * 0 : okay + * 1 : retry the same object + * The bpf_iter_run_prog() return value + * will be seq_ops->show() return value. + */ return ret == 0 ? 0 : -EAGAIN; } -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. 
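The registration pattern after this refactor, sketched for a hypothetical "foo" target (the seq_ops, init/fini callbacks and private struct are assumed to exist; the concrete conversions for the bpf_map and task targets follow in the diff):

  static const struct bpf_iter_reg foo_reg_info = {
  	.target			= "foo",
  	.seq_ops		= &foo_seq_ops,
  	.init_seq_private	= foo_init_seq_private,
  	.fini_seq_private	= foo_fini_seq_private,
  	.seq_priv_size		= sizeof(struct bpf_iter_seq_foo_info),
  };

  static int __init foo_iter_init(void)
  {
  	/* bpf_iter only keeps a pointer, so reg_info must stay valid. */
  	return bpf_iter_reg_target(&foo_reg_info);
  }
  late_initcall(foo_iter_init);
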
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- 
a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). 
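Sketch of the caller-side effect, reusing the hypothetical "foo" target from above:

  /* before: bpf_iter_unreg_target("foo"); */
  /* after: pass the same reg_info that was used for registration */
  bpf_iter_unreg_target(&foo_reg_info);
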
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. 
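Continuing the hypothetical "foo" target, a sketch of how a target declares that a pointer in its context may be NULL, so the verifier forces iterator programs to check it before dereferencing (struct bpf_iter__foo and its obj member are assumed names):

  static const struct bpf_iter_reg foo_reg_info = {
  	.target		= "foo",
  	.seq_ops	= &foo_seq_ops,
  	.seq_priv_size	= sizeof(struct bpf_iter_seq_foo_info),
  	.ctx_arg_info_size = 1,
  	.ctx_arg_info	= {
  		{ offsetof(struct bpf_iter__foo, obj),
  		  PTR_TO_BTF_ID_OR_NULL },
  	},
  };
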
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ 
static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From c70f34a8ac66c2cb05593ef5760142e5f862a9b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 May 2020 22:51:37 -0700 Subject: bpf: Fix bpf_iter's task iterator logic task_seq_get_next might stop prematurely if get_pid_task() fails to get task_struct. Failure to do so doesn't mean that there are no more tasks with higher pids. Procfs's iteration algorithm (see next_tgid in fs/proc/base.c) does a retry in such case. After this fix, instead of stopping prematurely after about 300 tasks on my server, bpf_iter program now returns >4000, which sounds much closer to reality. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200514055137.1564581-1-andriin@fb.com --- kernel/bpf/task_iter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index a9b7264dda08..4dbf2b6035f8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -27,9 +27,15 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns, struct pid *pid; rcu_read_lock(); +retry: pid = idr_get_next(&ns->idr, tid); - if (pid) + if (pid) { task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ++*tid; + goto retry; + } + } rcu_read_unlock(); return task; -- cgit v1.2.3 From db612f749e2454c506f20155bba2871f0307d133 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:38 +0200 Subject: xdp: Cpumap redirect use frame_sz and increase skb_tailroom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knowing the memory size backing the packet/xdp_frame data area, and knowing it already have reserved room for skb_shared_info, simplifies using build_skb significantly. With this change we no-longer lie about the SKB truesize, but more importantly a significant larger skb_tailroom is now provided, e.g. when drivers uses a full PAGE_SIZE. This extra tailroom (in linear area) can be used by the network stack when coalescing SKBs (e.g. in skb_try_coalesce, see TCP cases where tcp_queue_rcv() can 'eat' skb). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337822.97035.13557959180460986059.stgit@firesoul --- kernel/bpf/cpumap.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..a71790dab12d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. 
- * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); -- cgit v1.2.3 From c69b470eb85798514723ffa2686da6d21198c0d0 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:49 +0100 Subject: kdb: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: Jason Wessel Cc: Daniel Thompson Cc: kgdb-bugreport@lists.sourceforge.net Acked-by: Daniel Thompson Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-9-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..355bea54ca0e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -920,7 +920,7 @@ static void sysrq_handle_dbg(int key) kgdb_breakpoint(); } -static struct sysrq_key_op sysrq_dbg_op = { +static const struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, .help_msg = "debug(g)", .action_msg = "DEBUG", -- cgit v1.2.3 From 6400b5a0f604298a03748b96693a77d12b479998 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:50 +0100 Subject: kernel/power: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: linux-pm@vger.kernel.org Acked-by: Rafael J. Wysocki Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-10-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/power/poweroff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 6d475281c730..562aa0e450ed 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -29,7 +29,7 @@ static void handle_poweroff(int key) schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } -static struct sysrq_key_op sysrq_poweroff_op = { +static const struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "poweroff(o)", .action_msg = "Power Off", -- cgit v1.2.3 From 0ca650c430404708a6e8bea75e5f92ce8448f098 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:51 +0100 Subject: rcu: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: rcu@vger.kernel.org Reviewed-by: Paul E. 
McKenney Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-11-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4e6ed7b91f59 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -729,7 +729,7 @@ static void sysrq_show_rcu(int key) show_rcu_gp_kthreads(); } -static struct sysrq_key_op sysrq_rcudump_op = { +static const struct sysrq_key_op sysrq_rcudump_op = { .handler = sysrq_show_rcu, .help_msg = "show-rcu(y)", .action_msg = "Show RCU tree", -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enable ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that the networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigation applied. Such networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. 
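To make the four-way split concrete, the sketch below reconstructs the capability helpers from the description above. The real definitions land in the header half of this series, which is outside the kernel/-only diff shown here, so treat this as an illustration rather than the exact code.

/*
 * Hedged sketch of the capability helpers behind the new verifier flags,
 * reconstructed from the commit text above. The real definitions live in
 * the header part of this series (outside the kernel/-only diff).
 */
#include <linux/capability.h>

static inline bool bpf_capable(void)
{
        /* CAP_BPF is sufficient; CAP_SYS_ADMIN keeps existing setups working. */
        return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
}

/*
 * Per the text above, the three "dangerous" switches currently all reduce
 * to cap_perfmon, since leaking pointers or reading memory via side
 * channels is roughly equivalent to reading kernel memory.
 */
static inline bool bpf_allow_ptr_leaks(void)
{
        return perfmon_capable();
}

static inline bool bpf_bypass_spec_v1(void)
{
        return perfmon_capable();
}

static inline bool bpf_bypass_spec_v4(void)
{
        return perfmon_capable();
}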
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ 14 files changed, 109 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- 
a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. 
*/ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? 
-EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - 
!register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. 
*/ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); -- cgit v1.2.3 From d08b9f0ca6605e13dcb48f04e55a30545b3c71eb Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:07 -0700 Subject: scs: Add support for Clang's Shadow Call Stack (SCS) This change adds generic support for Clang's Shadow Call Stack, which uses a shadow stack to protect return addresses from being overwritten by an attacker. Details are available here: https://clang.llvm.org/docs/ShadowCallStack.html Note that security guarantees in the kernel differ from the ones documented for user space. The kernel must store addresses of shadow stacks in memory, which means an attacker capable reading and writing arbitrary memory may be able to locate them and hijack control flow by modifying the stacks. 
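Before reading the kernel/scs.c code below, it helps to see the header-side interface it relies on. The following sketch reconstructs it from the names used in the patch; include/linux/scs.h and the arch headers fall outside this kernel/-only dump, so the size, the sentinel value and the per-task storage are assumptions.

/*
 * Header-side interface assumed by kernel/scs.c (sketch only). The real
 * definitions are in include/linux/scs.h and the arch headers; SCS_SIZE,
 * SCS_END_MAGIC and the storage location are illustrative assumptions.
 */
#include <linux/gfp.h>
#include <linux/sizes.h>

#define SCS_SIZE        SZ_1K                           /* assumed per-task shadow stack size */
#define GFP_SCS         (GFP_KERNEL | __GFP_ZERO)       /* shadow stacks start out zeroed */
#define SCS_END_MAGIC   0xdeadbeefUL                    /* illustrative sentinel value */

/* The last slot of the shadow stack holds a sentinel used to catch overflow. */
static inline unsigned long *__scs_magic(void *s)
{
        return (unsigned long *)((char *)s + SCS_SIZE) - 1;
}

/*
 * task_scs(tsk) and task_scs_offset(tsk) resolve to per-task storage (on
 * arm64 they sit in thread_info so the x18 shadow stack pointer can be
 * saved and restored cheaply on kernel entry/exit), and scs_corrupted(tsk)
 * simply compares *__scs_magic(task_scs(tsk)) against SCS_END_MAGIC.
 */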
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda [will: Numerous cosmetic changes] Signed-off-by: Will Deacon --- kernel/Makefile | 1 + kernel/fork.c | 9 ++++++++ kernel/sched/core.c | 2 ++ kernel/scs.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 kernel/scs.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..f6339f9d232d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..934e03cfaec7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -6040,6 +6041,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..38f8f31c9451 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include +#include +#include +#include + +static struct kmem_cache *scs_cache; + +static void *scs_alloc(int node) +{ + void *s; + + s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + if (s) { + *__scs_magic(s) = SCS_END_MAGIC; + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. 
+ */ + kasan_poison_object_data(scs_cache, s); + } + + return s; +} + +static void scs_free(void *s) +{ + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = s; + task_scs_offset(tsk) = 0; + + return 0; +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_free(s); +} -- cgit v1.2.3 From 628d06a48f57c36abdc2a024930212e654a501b7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:08 -0700 Subject: scs: Add page accounting for shadow call stack allocations This change adds accounting for the memory allocated for shadow stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Acked-by: Will Deacon Signed-off-by: Will Deacon --- kernel/scs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 38f8f31c9451..6d2f983ac54e 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -6,8 +6,10 @@ */ #include +#include #include #include +#include #include static struct kmem_cache *scs_cache; @@ -40,6 +42,17 @@ void __init scs_init(void) scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); } +static struct page *__scs_page(struct task_struct *tsk) +{ + return virt_to_page(task_scs(tsk)); +} + +static void scs_account(struct task_struct *tsk, int account) +{ + mod_zone_page_state(page_zone(__scs_page(tsk)), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / 1024)); +} + int scs_prepare(struct task_struct *tsk, int node) { void *s = scs_alloc(node); @@ -49,6 +62,7 @@ int scs_prepare(struct task_struct *tsk, int node) task_scs(tsk) = s; task_scs_offset(tsk) = 0; + scs_account(tsk, 1); return 0; } @@ -61,5 +75,6 @@ void scs_release(struct task_struct *tsk) return; WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 5bbaf9d1fcb9be696ee9a61636ab6803556c70f2 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:09 -0700 Subject: scs: Add support for stack usage debugging Implements CONFIG_DEBUG_STACK_USAGE for shadow stacks. When enabled, also prints out the highest shadow stack usage per process. 
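The "highest shadow stack usage" report boils down to a lock-free high-water-mark update, visible in the scs_check_usage() hunk below. A plain C userspace analogue of that pattern, for illustration only:

#include <stdatomic.h>
#include <stdio.h>

/*
 * Userspace analogue (illustration only, not the kernel code) of the
 * "report a new high-water mark exactly once" pattern: concurrent callers
 * race on one atomic maximum and only the caller whose compare-exchange
 * succeeds prints the report.
 */
static _Atomic unsigned long highest;

static void report_usage(const char *who, unsigned long used)
{
        unsigned long curr = atomic_load_explicit(&highest, memory_order_relaxed);

        while (used > curr) {
                /* On failure, curr is refreshed with the currently stored maximum. */
                if (atomic_compare_exchange_weak_explicit(&highest, &curr, used,
                                                          memory_order_relaxed,
                                                          memory_order_relaxed)) {
                        printf("%s: highest usage so far: %lu bytes\n", who, used);
                        break;
                }
        }
}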
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Acked-by: Will Deacon [will: rewrote most of scs_check_usage()] Signed-off-by: Will Deacon --- kernel/scs.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 6d2f983ac54e..9389c28f0853 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -63,10 +63,37 @@ int scs_prepare(struct task_struct *tsk, int node) task_scs(tsk) = s; task_scs_offset(tsk) = 0; scs_account(tsk, 1); - return 0; } +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used++; + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + void scs_release(struct task_struct *tsk) { void *s = task_scs(tsk); @@ -75,6 +102,7 @@ void scs_release(struct task_struct *tsk) return; WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 2f4c33063ad713e3a5b63002cf8362846e78bd71 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 15 May 2020 18:02:22 +0200 Subject: docs: sysctl/kernel: document ngroups_max This is a read-only export of NGROUPS_MAX, so this patch also changes the declarations in kernel/sysctl.c to const. Signed-off-by: Stephen Kitt Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20200515160222.7994-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..2ba9f449d273 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -146,7 +146,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; /* @@ -883,7 +883,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3 From b5945214b76a1f22929481724ffd448000ede914 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 4 May 2020 10:50:17 -0700 Subject: kernel/cpu_pm: Fix uninitted local in cpu_pm cpu_pm_notify() is basically a wrapper of notifier_call_chain(). notifier_call_chain() doesn't initialize *nr_calls to 0 before it starts incrementing it--presumably it's up to the callers to do this. Unfortunately the callers of cpu_pm_notify() don't init *nr_calls. This potentially means you could get too many or two few calls to CPU_PM_ENTER_FAILED or CPU_CLUSTER_PM_ENTER_FAILED depending on the luck of the stack. Let's fix this. Fixes: ab10023e0088 ("cpu_pm: Add cpu power management notifiers") Cc: stable@vger.kernel.org Cc: Rafael J. 
Wysocki Reviewed-by: Stephen Boyd Reviewed-by: Greg Kroah-Hartman Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200504104917.v6.3.I2d44fc0053d019f239527a4e5829416714b7e299@changeid Signed-off-by: Bjorn Andersson --- kernel/cpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index cbca6879ab7d..44a259338e33 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); -- cgit v1.2.3 From 2ec0616e870f0f2aa8353e0de057f0c2dc8d52d5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 May 2020 00:39:18 +0200 Subject: bpf: Fix check_return_code to only allow [0,1] in trace_iter progs As per 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") we only allow a range of [0,1] for return codes. Therefore BPF_TRACE_ITER relies on the default tnum_range(0, 1) which is set in range var. On recent merge of net into net-next commit e92888c72fbd ("bpf: Enforce returning 0 for fentry/fexit progs") got pulled in and caused a merge conflict with the changes from 15d83c4d7cef. The resolution had a snall hiccup in that it removed the [0,1] range restriction again so that BPF_TRACE_ITER would have no enforcement. Fix it by adding it back. Fixes: da07f52d3caf ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b14ee0e26d..9c7d67d65d8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7120,10 +7120,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_FEXIT: range = tnum_const(0); break; - case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } -- cgit v1.2.3 From 870c153cf0e6df1b8b5226af41b19945e8e0d143 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 May 2020 18:02:23 +0200 Subject: blktrace: Report pid with note messages Currently informational messages within block trace do not have PID information of the process reporting the message included. With BFQ it is sometimes useful to have the information and there's no good reason to omit the information from the trace. So just fill in pid information when generating note message. Signed-off-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ca39dc3230cb..ea47f2084087 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -170,10 +170,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, blkcg ? 
cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } -- cgit v1.2.3 From 5c8f77a278737a6af44a892f0700d9aadb2b0de0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:00 +0200 Subject: irqdomain: Make irq_domain_reset_irq_data() available to non-hierarchical users irq_domain_reset_irq_data() doesn't modify the parent data, so it can be made available even if irq domain hierarchy is not being built. We'll subsequently use it in irq_sim code. Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20200514083901.23445-2-brgl@bgdev.pl --- kernel/irq/irqdomain.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..e2aa128ea3ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1047,6 +1047,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1247,18 +1259,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, } EXPORT_SYMBOL(irq_domain_set_info); -/** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match -- cgit v1.2.3 From 337cbeb2c13eb4cab84f576fd402d7ae4ed31ae1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:01 +0200 Subject: genirq/irq_sim: Simplify the API The interrupt simulator API exposes a lot of custom data structures and functions and doesn't reuse the interfaces already exposed by the irq subsystem. This patch tries to address it. We hide all the simulator-related data structures from users and instead rely on the well-known irq domain. When creating the interrupt simulator the user receives a pointer to a newly created irq_domain and can use it to create mappings for simulated interrupts. It is also possible to pass a handle to fwnode when creating the simulator domain and retrieve it using irq_find_matching_fwnode(). The irq_sim_fire() function is dropped as well. Instead we implement the irq_get/set_irqchip_state interface. We modify the two modules that use the simulator at the same time as adding these changes in order to reduce the intermediate bloat that would result when trying to migrate the drivers in separate patches. 
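To make the reworked simulator interface concrete, here is a hedged consumer sketch built only from what this patch exports (devm_irq_domain_create_sim(), the usual irq domain mapping calls and irq_set_irqchip_state()); the device, handler and label names are illustrative and error handling is trimmed to the essentials.

/*
 * Hedged consumer sketch of the new irq_sim API; demo_* names are
 * illustrative. A simulated interrupt is mapped, requested like any other
 * interrupt, and "fired" by marking it pending.
 */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irq_sim.h>
#include <linux/irqdomain.h>

static irqreturn_t demo_handler(int irq, void *data)
{
        return IRQ_HANDLED;
}

static int demo_setup(struct device *dev)
{
        struct irq_domain *domain;
        int irq, ret;

        /* One simulated interrupt line, no fwnode attached. */
        domain = devm_irq_domain_create_sim(dev, NULL, 1);
        if (IS_ERR(domain))
                return PTR_ERR(domain);

        /* Map hwirq 0 to a virtual interrupt number and request it as usual. */
        irq = irq_create_mapping(domain, 0);
        if (!irq)
                return -EINVAL;

        ret = devm_request_irq(dev, irq, demo_handler, 0, "demo-sim", NULL);
        if (ret)
                return ret;

        /* "Fire" the simulated interrupt by marking it pending. */
        return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true);
}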
Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Acked-by: Jonathan Cameron #for IIO Link: https://lore.kernel.org/r/20200514083901.23445-3-brgl@bgdev.pl --- kernel/irq/Kconfig | 1 + kernel/irq/irq_sim.c | 267 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 166 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..d63c324895ea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -72,6 +72,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski + * Copyright (C) 2020 Bartosz Golaszewski */ -#include -#include #include +#include +#include +#include +#include + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data *data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct 
irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. 
+ * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. - */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); -- cgit v1.2.3 From fdb1b5e08929f21d29aedd447233b32f497b8999 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 18 May 2020 06:19:25 -0600 Subject: Revert "docs: sysctl/kernel: document ngroups_max" This reverts commit 2f4c33063ad713e3a5b63002cf8362846e78bd71. 
The changes here were fine, but there's a non-documentation change to sysctl.c that makes messes elsewhere; those changes should have been done independently. Signed-off-by: Jonathan Corbet --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ba9f449d273..8a176d8727a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -146,7 +146,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static const int ngroups_max = NGROUPS_MAX; +static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; /* @@ -883,7 +883,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = (void *)&ngroups_max, + .data = &ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3 From 29da4f91c0c1fbda12b8a31be0d564930208c92e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 14 May 2020 16:47:39 +0530 Subject: powerpc/watchpoint: Don't allow concurrent perf and ptrace events With Book3s DAWR, ptrace and perf watchpoints on powerpc behaves differently. Ptrace watchpoint works in one-shot mode and generates signal before executing instruction. It's ptrace user's job to single-step the instruction and re-enable the watchpoint. OTOH, in case of perf watchpoint, kernel emulates/single-steps the instruction and then generates event. If perf and ptrace creates two events with same or overlapping address ranges, it's ambiguous to decide who should single-step the instruction. Because of this issue, don't allow perf and ptrace watchpoint at the same time if their address range overlaps. Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Reviewed-by: Michael Neuling Link: https://lore.kernel.org/r/20200514111741.97993-15-ravi.bangoria@linux.ibm.com --- kernel/events/hw_breakpoint.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3cc8416ec844..b48d7039a015 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -213,6 +213,15 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, list_del(&bp->hw.bp_list); } +__weak int arch_reserve_bp_slot(struct perf_event *bp) +{ + return 0; +} + +__weak void arch_release_bp_slot(struct perf_event *bp) +{ +} + /* * Function to perform processor-specific cleanup during unregistration */ @@ -270,6 +279,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) struct bp_busy_slots slots = {0}; enum bp_type_idx type; int weight; + int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) @@ -294,6 +304,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; + ret = arch_reserve_bp_slot(bp); + if (ret) + return ret; + toggle_bp_slot(bp, true, type, weight); return 0; @@ -317,6 +331,8 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int weight; + arch_release_bp_slot(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); -- cgit v1.2.3 From 202164fbfa2b2ffa3e66b504e0f126ba9a745006 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:39 -0700 Subject: kgdb: Disable WARN_CONSOLE_UNLOCKED for all kgdb In commit 
81eaadcae81b ("kgdboc: disable the console lock when in kgdb") we avoided the WARN_CONSOLE_UNLOCKED() yell when we were in kgdboc. That still works fine, but it turns out that we get a similar yell when using other I/O drivers. One example is the "I/O driver" for the kgdb test suite (kgdbts). When I enabled that I again got the same yells. Even though "kgdbts" doesn't actually interact with the user over the console, using it still causes kgdb to print to the consoles. That trips the same warning: con_is_visible+0x60/0x68 con_scroll+0x110/0x1b8 lf+0x4c/0xc8 vt_console_print+0x1b8/0x348 vkdb_printf+0x320/0x89c kdb_printf+0x68/0x90 kdb_main_loop+0x190/0x860 kdb_stub+0x2cc/0x3ec kgdb_cpu_enter+0x268/0x744 kgdb_handle_exception+0x1a4/0x200 kgdb_compiled_brk_fn+0x34/0x44 brk_handler+0x7c/0xb8 do_debug_exception+0x1b4/0x228 Let's increment/decrement the "ignore_console_lock_warning" variable all the time when we enter the debugger. This will allow us to later revert commit 81eaadcae81b ("kgdboc: disable the console lock when in kgdb"). Signed-off-by: Douglas Anderson Reviewed-by: Greg Kroah-Hartman Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20200507130644.v4.1.Ied2b058357152ebcc8bf68edd6f20a11d98d7d4e@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2266ba27f27d..b542f36499c6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -666,6 +666,8 @@ return_normal: if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) goto kgdb_restore; + atomic_inc(&ignore_console_lock_warning); + /* Call the I/O driver's pre_exception routine */ if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); @@ -738,6 +740,8 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); + atomic_dec(&ignore_console_lock_warning); + if (!kgdb_single_step) { raw_spin_unlock(&dbg_slave_lock); /* Wait till all the CPUs have quit from the debugger. */ -- cgit v1.2.3 From 51189c7a7ed1b4ed4493e27275d466ff60406d3a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:11:05 +0100 Subject: arm64: scs: Store absolute SCS stack pointer value in thread_info Storing the SCS information in thread_info as a {base,offset} pair introduces an additional load instruction on the ret-to-user path, since the SCS stack pointer in x18 has to be converted back to an offset by subtracting the base. Replace the offset with the absolute SCS stack pointer value instead and avoid the redundant load. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 9389c28f0853..5ff8663e4a67 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -60,8 +60,7 @@ int scs_prepare(struct task_struct *tsk, int node) if (!s) return -ENOMEM; - task_scs(tsk) = s; - task_scs_offset(tsk) = 0; + task_scs(tsk) = task_scs_sp(tsk) = s; scs_account(tsk, 1); return 0; } -- cgit v1.2.3 From bee348fab099b0f551caa874663e82a7f3bb64b3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:43:11 +0100 Subject: scs: Move accounting into alloc/free functions There's no need to perform the shadow stack page accounting independently of the lifetime of the underlying allocation, so call the accounting code from the {alloc,free}() functions and simplify the code in the process. 
Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 5ff8663e4a67..aea841cd7586 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -14,25 +14,35 @@ static struct kmem_cache *scs_cache; +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + static void *scs_alloc(int node) { - void *s; - - s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); - if (s) { - *__scs_magic(s) = SCS_END_MAGIC; - /* - * Poison the allocation to catch unintentional accesses to - * the shadow stack when KASAN is enabled. - */ - kasan_poison_object_data(scs_cache, s); - } + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); return s; } static void scs_free(void *s) { + __scs_account(s, -1); kasan_unpoison_object_data(scs_cache, s); kmem_cache_free(scs_cache, s); } @@ -42,17 +52,6 @@ void __init scs_init(void) scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); } -static struct page *__scs_page(struct task_struct *tsk) -{ - return virt_to_page(task_scs(tsk)); -} - -static void scs_account(struct task_struct *tsk, int account) -{ - mod_zone_page_state(page_zone(__scs_page(tsk)), NR_KERNEL_SCS_KB, - account * (SCS_SIZE / 1024)); -} - int scs_prepare(struct task_struct *tsk, int node) { void *s = scs_alloc(node); @@ -61,7 +60,6 @@ int scs_prepare(struct task_struct *tsk, int node) return -ENOMEM; task_scs(tsk) = task_scs_sp(tsk) = s; - scs_account(tsk, 1); return 0; } @@ -102,6 +100,5 @@ void scs_release(struct task_struct *tsk) WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); scs_check_usage(tsk); - scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 88485be531f4aee841ddc53b56e2f6e6a338854d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:56:05 +0100 Subject: scs: Move scs_overflow_check() out of architecture code There is nothing architecture-specific about scs_overflow_check() as it's just a trivial wrapper around scs_corrupted(). For parity with task_stack_end_corrupted(), rename scs_corrupted() to task_scs_end_corrupted() and call it from schedule_debug() when CONFIG_SCHED_STACK_END_CHECK_is enabled, which better reflects its purpose as a debug feature to catch inadvertent overflow of the SCS. Finally, remove the unused scs_overflow_check() function entirely. This has absolutely no impact on architectures that do not support SCS (currently arm64 only). 
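For context, a hedged sketch of what the renamed helper plausibly looks like on the header side; include/linux/scs.h is outside this kernel/-only dump, so the exact bounds expression is an assumption.

/*
 * Hedged sketch (assumption) of the header-side helper called below. The
 * intent mirrors task_stack_end_corrupted(): the saved shadow stack pointer
 * must still be inside the allocation and the end sentinel must be intact;
 * the real bounds check in include/linux/scs.h may differ.
 */
static inline bool task_scs_end_corrupted(struct task_struct *tsk)
{
        unsigned long *magic = __scs_magic(task_scs(tsk));
        unsigned long used = (char *)task_scs_sp(tsk) - (char *)task_scs(tsk);

        return used >= SCS_SIZE - sizeof(*magic) ||
               READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
}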
Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/sched/core.c | 3 +++ kernel/scs.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 934e03cfaec7..a1d815a11b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3878,6 +3878,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/kernel/scs.c b/kernel/scs.c index aea841cd7586..faf0ecd7b893 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -98,7 +98,8 @@ void scs_release(struct task_struct *tsk) if (!s) return; - WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); scs_check_usage(tsk); scs_free(s); } -- cgit v1.2.3 From aa7a65ae5b8f459617e5ed1422301386e7f12274 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 16:15:46 +0100 Subject: scs: Remove references to asm/scs.h from core code asm/scs.h is no longer needed by the core code, so remove a redundant header inclusion and update the stale Kconfig text. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index faf0ecd7b893..222a7a9ad543 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -10,7 +10,6 @@ #include #include #include -#include static struct kmem_cache *scs_cache; -- cgit v1.2.3 From b1a57bbfcc17c87e5cc76695ebb0565380c7501a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:42 -0700 Subject: kgdb: Delay "kgdbwait" to dbg_late_init() by default Using kgdb requires at least some level of architecture-level initialization. If nothing else, it relies on the architecture to pass breakpoints / crashes onto kgdb. On some architectures this all works super early, specifically it starts working at some point in time before Linux parses early_params's. On other architectures it doesn't. A survey of a few platforms: a) x86: Presumably it all works early since "ekgdboc" is documented to work here. b) arm64: Catching crashes works; with a simple patch breakpoints can also be made to work. c) arm: Nothing in kgdb works until paging_init() -> devicemaps_init() -> early_trap_init() Let's be conservative and, by default, process "kgdbwait" (which tells the kernel to drop into the debugger ASAP at boot) a bit later at dbg_late_init() time. If an architecture has tested it and wants to re-enable super early debugging, they can select the ARCH_HAS_EARLY_DEBUG KConfig option. We'll do this for x86 to start. It should be noted that dbg_late_init() is still called quite early in the system. Note that this patch doesn't affect when kgdb runs its init. If kgdb is set to initialize early it will still initialize when parsing early_param's. This patch _only_ inhibits the initial breakpoint from "kgdbwait". This means: * Without any extra patches arm64 platforms will at least catch crashes after kgdb inits. * arm platforms will catch crashes (and could handle a hardcoded kgdb_breakpoint()) any time after early_trap_init() runs, even before dbg_late_init(). 
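The resulting flow is a record-early, act-late gate: the early parameter only notes the request, and the initial breakpoint fires immediately only when the architecture has opted in, otherwise at dbg_late_init() time. A standalone mock-up of that gating (hedged; the kernel's real logic is in the diff below, and an architecture opts in by selecting ARCH_HAS_EARLY_DEBUG in its Kconfig):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_ARCH_HAS_EARLY_DEBUG 0	/* stands in for the Kconfig option */

static bool break_asap;

static void initial_breakpoint(void)
{
	break_asap = false;
	printf("dropping into the debugger\n");
}

static void parse_kgdbwait_early_param(void)
{
	break_asap = true;			/* record the request */
	if (DEMO_ARCH_HAS_EARLY_DEBUG)		/* arch says traps work this early */
		initial_breakpoint();
}

static void dbg_late_init_demo(void)
{
	if (break_asap)				/* deferred request from early boot */
		initial_breakpoint();
}

int main(void)
{
	parse_kgdbwait_early_param();	/* early: usually just records the request */
	dbg_late_init_demo();		/* late: acts on it */
	return 0;
}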
Signed-off-by: Douglas Anderson Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200507130644.v4.4.I3113aea1b08d8ce36dc3720209392ae8b815201b@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b542f36499c6..1cd8e8b41115 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -948,6 +948,14 @@ void kgdb_panic(const char *msg) kgdb_breakpoint(); } +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + pr_crit("Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + void __weak kgdb_arch_late(void) { } @@ -958,6 +966,9 @@ void __init dbg_late_init(void) if (kgdb_io_module_registered) kgdb_arch_late(); kdb_init(KDB_INIT_FULL); + + if (kgdb_io_module_registered && kgdb_break_asap) + kgdb_initial_breakpoint(); } static int @@ -1053,14 +1064,6 @@ void kgdb_schedule_breakpoint(void) } EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - pr_crit("Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - /** * kgdb_register_io_module - register KGDB IO module * @new_dbg_io_ops: the io ops vector @@ -1097,7 +1100,8 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) /* Arm KGDB now. */ kgdb_register_callbacks(); - if (kgdb_break_asap) + if (kgdb_break_asap && + (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))) kgdb_initial_breakpoint(); return 0; @@ -1167,7 +1171,8 @@ static int __init opt_kgdb_wait(char *str) kgdb_break_asap = 1; kdb_init(KDB_INIT_EARLY); - if (kgdb_io_module_registered) + if (kgdb_io_module_registered && + IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)) kgdb_initial_breakpoint(); return 0; -- cgit v1.2.3 From 3ca676e4ca60d1834bb77535dafe24169cadacef Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:44 -0700 Subject: kgdb: Prevent infinite recursive entries to the debugger If we detect that we recursively entered the debugger we should hack our I/O ops to NULL so that the panic() in the next line won't actually cause another recursion into the debugger. The first line of kgdb_panic() will check this and return. Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20200507130644.v4.6.I89de39f68736c9de610e6f241e68d8dbc44bc266@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1cd8e8b41115..e2d67b163fb6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -530,6 +530,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) if (exception_level > 1) { dump_stack(); + kgdb_io_module_registered = false; panic("Recursive entry to debugger"); } -- cgit v1.2.3 From 220995622da5317714b5fe659165735f7b44b87e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:46 -0700 Subject: kgdboc: Add kgdboc_earlycon to support early kgdb using boot consoles We want to enable kgdb to debug the early parts of the kernel. Unfortunately kgdb normally is a client of the tty API in the kernel and serial drivers don't register to the tty layer until fairly late in the boot process. Serial drivers do, however, commonly register a boot console. 
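For reference, a hedged sketch of the kind of boot console this relies on: a hypothetical driver registering a CON_BOOT console that provides write() plus a polling, non-blocking read(). Every name below is invented for illustration; no existing driver is implied.

#include <linux/console.h>

static void demo_earlycon_write(struct console *con, const char *s, unsigned n)
{
	/* poll the UART TX register for each character */
}

static int demo_earlycon_read(struct console *con, char *s, unsigned n)
{
	/* return immediately with 0 if the RX FIFO is empty */
	return 0;
}

static struct console demo_earlycon = {
	.name	= "demo_uart",
	.write	= demo_earlycon_write,
	.read	= demo_earlycon_read,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};

/* a real driver would call register_console(&demo_earlycon) from its
 * earlycon setup hook */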
Let's enable the kgdboc driver to work with boot consoles to provide early debugging. This change co-opts the existing read() function pointer that's part of "struct console". It's assumed that if a boot console (with the flag CON_BOOT) has implemented read() then both the read() and write() functions are polling functions. That means they work without interrupts and read() will return immediately (with 0 bytes read) if there's nothing to read. This should be a safe assumption since it appears that no current boot consoles implement read() right now and there seems no reason to do so unless they wanted to support "kgdboc_earlycon". The normal/expected way to make all this work is to use "kgdboc_earlycon" and "kgdboc" together. You should point them both to the same physical serial connection. At boot time, as the system transitions from the boot console to the normal console (and registers a tty), kgdb will switch over. One awkward part of all this, though, is that there can be a window where the boot console goes away and we can't quite transition over to the main kgdboc that uses the tty layer. There are two main problems: 1. The act of registering the tty doesn't cause any call into kgdboc, so there is a window of time when the tty is there but kgdboc's init code hasn't been called and we can't transition to it. 2. On some serial drivers the normal console inits (and replaces the boot console) quite early in the system. Presumably these drivers were coded up before earlycon worked as well as it does today and probably they don't need to do this anymore, but it causes us problems nonetheless. Problem #1 is not too big of a deal, somewhat due to the luck of probe ordering. kgdboc is last in the tty/serial/Makefile so its probe gets called right after all other tty devices. It's not fun to rely on this, but it does work for the most part. Problem #2 is a big deal, but only for some serial drivers. Other serial drivers end up registering the console (which gets rid of the boot console) and the tty at nearly the same time. The way we'll deal with the window between the time the system has stopped using the boot console and the time we're set up using the tty is to keep using the boot console. This may sound surprising, but it has been found to work well in practice. If it doesn't work, it shouldn't be too hard for a given serial driver to make it keep working. Specifically, it's expected that the read()/write() functions provided in the boot console should be the same (or nearly the same) as the normal kgdb polling functions. That means continuing to use them should work just fine. To make things even more likely to work, we'll also trap the recently added exit() function in the boot console we're using and delay any calls to it until we're all done with the boot console. NOTE: there could be ways to use all this in weird / unexpected ways. If you do something like this, it's a bit of a buyer beware situation. Specifically: - If you specify only "kgdboc_earlycon" but not "kgdboc" then (depending on your serial driver) things will probably work OK, but you'll get a warning printed the first time you use kgdb after the boot console is gone. You'd only be able to do this, of course, if the serial driver you're running atop provided an early boot console. - If your "kgdboc_earlycon" and "kgdboc" devices are not the same device, things should work OK, but it'll be your job to switch over which device you're monitoring (including figuring out how to switch over gdb in-flight if you're using it).
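One way the exit() trapping mentioned above could be arranged (a hedged sketch only, not the kgdboc implementation): save the boot console's exit hook, substitute a stub while the debugger still needs the console, and invoke the saved hook once the hand-over is complete.

#include <linux/console.h>

static int (*saved_earlycon_exit)(struct console *con);
static struct console *trapped_earlycon;

static int trapped_earlycon_exit(struct console *con)
{
	/* defer: the debugger is still using this console for polled I/O */
	return 0;
}

static void demo_trap_earlycon_exit(struct console *con)
{
	trapped_earlycon = con;
	saved_earlycon_exit = con->exit;
	con->exit = trapped_earlycon_exit;
}

static void demo_release_earlycon(void)
{
	if (trapped_earlycon && saved_earlycon_exit)
		saved_earlycon_exit(trapped_earlycon);	/* run the deferred exit now */
	trapped_earlycon = NULL;
}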
When trying to enable "kgdboc_earlycon" it should be noted that the names that are registered through the boot console layer and the tty layer are not the same for the same port. For example when debugging on one board I'd need to pass "kgdboc_earlycon=qcom_geni kgdboc=ttyMSM0" to enable things properly. Since digging up the boot console name is a pain and there will rarely be more than one boot console enabled, you can provide the "kgdboc_earlycon" parameter without specifying the name of the boot console. In this case we'll just pick the first boot that implements read() that we find. This new "kgdboc_earlycon" parameter should be contrasted to the existing "ekgdboc" parameter. While both provide a way to debug very early, the usage and mechanisms are quite different. Specifically "kgdboc_earlycon" is meant to be used in tandem with "kgdboc" and there is a transition from one to the other. The "ekgdboc" parameter, on the other hand, replaces the "kgdboc" parameter. It runs the same logic as the "kgdboc" parameter but just relies on your TTY driver being present super early. The only known usage of the old "ekgdboc" parameter is documented as "ekgdboc=kbd earlyprintk=vga". It should be noted that "kbd" has special treatment allowing it to init early as a tty device. Signed-off-by: Douglas Anderson Reviewed-by: Greg Kroah-Hartman Tested-by: Sumit Garg Link: https://lore.kernel.org/r/20200507130644.v4.8.I8fba5961bf452ab92350654aa61957f23ecf0100@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index e2d67b163fb6..4d59aa907fdc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1073,15 +1073,23 @@ EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); */ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { + struct kgdb_io *old_dbg_io_ops; int err; spin_lock(&kgdb_registration_lock); - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); + old_dbg_io_ops = dbg_io_ops; + if (old_dbg_io_ops) { + if (!old_dbg_io_ops->deinit) { + spin_unlock(&kgdb_registration_lock); - pr_err("Another I/O driver is already registered with KGDB\n"); - return -EBUSY; + pr_err("KGDB I/O driver %s can't replace %s.\n", + new_dbg_io_ops->name, old_dbg_io_ops->name); + return -EBUSY; + } + pr_info("Replacing I/O driver %s with %s\n", + old_dbg_io_ops->name, new_dbg_io_ops->name); + old_dbg_io_ops->deinit(); } if (new_dbg_io_ops->init) { @@ -1096,6 +1104,9 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops) + return 0; + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ @@ -1132,6 +1143,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops->deinit) + old_dbg_io_ops->deinit(); + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } -- cgit v1.2.3 From f83b04d36e52cc3d941120ec859374fcda36eb31 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 16 Apr 2020 10:38:04 +0800 Subject: kgdb: Add kgdb_has_hit_break function The break instruction in RISC-V does not have an immediate value field, so the kernel cannot identify the purpose of each trap exception through the opcode. This makes the existing identification schemes in other architecture unsuitable for the RISC-V kernel. 
To solve this problem, this patch adds kgdb_has_hit_break(), which can help RISC-V kernel identify the KGDB trap exception. Signed-off-by: Vincent Chen Reviewed-by: Palmer Dabbelt Acked-by: Daniel Thompson Signed-off-by: Palmer Dabbelt --- kernel/debug/debug_core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..01bc3eea3d4d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -417,6 +417,18 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } +int kgdb_has_hit_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_ACTIVE && + kgdb_break[i].bpt_addr == addr) + return 1; + } + return 0; +} + int dbg_remove_all_break(void) { int error; -- cgit v1.2.3 From 2318976619daf0e868de5b8aff19c1fd8d585867 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 14 May 2020 11:36:41 +0100 Subject: ARM: 8976/1: module: allow arch overrides for .init section names ARM stores unwind information for .init.text in sections named .ARM.extab.init.text and .ARM.exidx.init.text. Since those aren't currently recognized as init sections, they're allocated along with the core section, and relocation fails if the core and the init section are allocated from different regions and can't reach other. final section addresses: ... 0x7f800000 .init.text .. 0xcbb54078 .ARM.exidx.init.text .. section 16 reloc 0 sym '': relocation 42 out of range (0xcbb54078 -> 0x7f800000) Allow architectures to override the section name so that ARM can fix this. Acked-by: Jessica Yu Signed-off-by: Vincent Whitchurch Signed-off-by: Russell King --- kernel/module.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..d29c23d07aff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2400,7 +2400,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2433,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2768,6 +2768,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); -- cgit v1.2.3 From 9d78edeaec759f997c303f286ecd39daee166f2a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 18 May 2020 20:07:38 +0200 Subject: proc: proc_pid_ns takes super_block as an argument syzbot found that touch /proc/testfile causes NULL pointer dereference at tomoyo_get_local_path() because inode of the dentry is NULL. Before c59f415a7cb6, Tomoyo received pid_ns from proc's s_fs_info directly. Since proc_pid_ns() can only work with inode, using it in the tomoyo_get_local_path() was wrong. 
To avoid creating more functions for getting proc_ns, change the argument type of the proc_pid_ns() function. Then, Tomoyo can use the existing super_block to get pid_ns. Link: https://lkml.kernel.org/r/0000000000002f0c7505a5b0e04c@google.com Link: https://lkml.kernel.org/r/20200518180738.2939611-1-gladkov.alexey@gmail.com Reported-by: syzbot+c1af344512918c61362c@syzkaller.appspotmail.com Fixes: c59f415a7cb6 ("Use proc_pid_ns() to get pid_namespace from the proc superblock") Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4385f3d639f2..e7bdaccad942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1745,7 +1745,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } -- cgit v1.2.3 From 0995a5dfbe49badff78e78761fb66f46579f2f9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 13:09:50 +0100 Subject: tracing: Provide lockdep less trace_hardirqs_on/off() variants trace_hardirqs_on/off() is only partially safe vs. RCU idle. The tracer core itself is safe, but the resulting tracepoints can be utilized by e.g. BPF which is unsafe. Provide variants which do not contain the lockdep invocation so the lockdep and tracer invocations can be split at the call site and placed properly. This is required because lockdep needs to be aware of the state before switching away from RCU idle and after switching to RCU idle because these transitions can take locks. As these code pathes are going to be non-instrumentable the tracer can be invoked after RCU is turned on and before the switch to RCU idle. So for these new variants there is no need to invoke the rcuidle aware tracer functions. Name them so they match the lockdep counterparts. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.270771162@linutronix.de --- kernel/trace/trace_preemptirq.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..c00880162b06 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -33,6 +51,25 @@ void trace_hardirqs_on(void) EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); +/* + * Like trace_hardirqs_off() but without the lockdep invocation. 
This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_off_prepare(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); + void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { -- cgit v1.2.3 From c86e9b987cea3dd0209203e714553a47f5d7c6dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Mar 2020 14:22:03 +0100 Subject: lockdep: Prepare for noinstr sections Force inlining and prevent instrumentation of all sorts by marking the functions which are invoked from low level entry code with 'noinstr'. Split the irqflags tracking into two parts. One which does the heavy lifting while RCU is watching and the final one which can be invoked after RCU is turned off. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.484532537@linutronix.de --- kernel/locking/lockdep.c | 86 +++++++++++++++++++++++++++++++---------- kernel/trace/trace_preemptirq.c | 2 + 2 files changed, 67 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..9ccd675a8b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3635,13 +3635,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,15 +3651,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. 
+ */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3698,20 +3699,62 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || curr->lockdep_recursion)) + return; + + if (curr->hardirqs_enabled) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks || curr->lockdep_recursion)) return; /* @@ -3729,10 +3772,11 @@ void lockdep_hardirqs_off(unsigned long ip) curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -4408,8 +4452,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4740,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4956,7 +5000,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index c00880162b06..fb0691b8a88d 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,6 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } 
EXPORT_SYMBOL(trace_hardirqs_on); @@ -93,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -- cgit v1.2.3 From 8c4e93c362ff114def211d4629b120af86eb1275 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 24 Feb 2020 13:13:31 +0100 Subject: printk: Prepare for nested printk_nmi_enter() There is plenty of space in the printk_context variable. Reserve one byte there for the NMI context to be on the safe side. It should never overflow. The BUG_ON(in_nmi() == NMI_MASK) in nmi_enter() will trigger much earlier. Signed-off-by: Petr Mladek Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.681374113@linutronix.de --- kernel/printk/internal.h | 8 +++++--- kernel/printk/printk_safe.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..e8791f206417 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -295,12 +295,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void notrace printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } void notrace printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* -- cgit v1.2.3 From b0f51883f551b900a04a80f49fb0886caf7e9a12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:25:03 +0100 Subject: printk: Disallow instrumenting print_nmi_enter() It happens early in nmi_enter(), no tracing, probing or other funnies allowed. Specifically as nmi_enter() will be used in do_debug(), which would cause recursive exceptions when kprobed. 
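A small standalone illustration (userspace C, hedged) of why the nested-counter change above replaces the or/and of a single flag with add/sub of an offset: the mask values mirror the ones introduced in that patch, everything else is a mock-up.

#include <stdio.h>

#define OLD_PRINTK_NMI_CONTEXT_MASK	0x80000000UL	/* single flag bit (before) */
#define PRINTK_NMI_CONTEXT_MASK		0xff0000000ULL	/* one whole byte (after) */
#define PRINTK_NMI_CONTEXT_OFFSET	0x010000000ULL

int main(void)
{
	unsigned long long ctx;

	/* old scheme: a nested exit clears the flag for the outer NMI too */
	ctx = 0;
	ctx |= OLD_PRINTK_NMI_CONTEXT_MASK;	/* outer printk_nmi_enter() */
	ctx |= OLD_PRINTK_NMI_CONTEXT_MASK;	/* nested enter */
	ctx &= ~OLD_PRINTK_NMI_CONTEXT_MASK;	/* nested exit */
	printf("flag scheme, still in NMI? %s\n",
	       (ctx & OLD_PRINTK_NMI_CONTEXT_MASK) ? "yes" : "no (wrong)");

	/* new scheme: add/sub of an offset keeps a per-level count */
	ctx = 0;
	ctx += PRINTK_NMI_CONTEXT_OFFSET;	/* outer enter */
	ctx += PRINTK_NMI_CONTEXT_OFFSET;	/* nested enter */
	ctx -= PRINTK_NMI_CONTEXT_OFFSET;	/* nested exit */
	printf("counter scheme, still in NMI? %s\n",
	       (ctx & PRINTK_NMI_CONTEXT_MASK) ? "yes" : "no");
	return 0;
}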
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.139720912@linutronix.de --- kernel/printk/printk_safe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index e8791f206417..4242403316bb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -293,12 +294,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void notrace printk_nmi_enter(void) +void noinstr printk_nmi_enter(void) { this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -void notrace printk_nmi_exit(void) +void noinstr printk_nmi_exit(void) { this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -- cgit v1.2.3 From e616cb8daadf637175af4fe53138a94c190c4816 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:14:51 +0100 Subject: lockdep: Always inline lockdep_{off,on}() These functions are called {early,late} in nmi_{enter,exit} and should not be traced or probed. They are also puny, so 'inline' them. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.048523500@linutronix.de --- kernel/locking/lockdep.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..6f1c8cba09c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) -- cgit v1.2.3 From 178ba00c354eb15cec6806a812771e60a5ae3ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:26:21 +0100 Subject: sh/ftrace: Move arch_ftrace_nmi_{enter,exit} into nmi exception SuperH is the last remaining user of arch_ftrace_nmi_{enter,exit}(), remove it from the generic code and into the SuperH code. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Steven Rostedt (VMware) Cc: Rich Felker Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200505134101.248881738@linutronix.de --- kernel/trace/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1e9a8f9a3459..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB -- cgit v1.2.3 From ff5c4f5cad33061b07c3fb9187506783c0f3cb66 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2020 17:32:17 +0100 Subject: rcu/tree: Mark the idle relevant functions noinstr These functions are invoked from context tracking and other places in the low level entry code. Move them into the .noinstr.text section to exclude them from instrumentation. Mark the places which are safe to invoke traceable functions with instrumentation_begin/end() so objtool won't complain. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: https://lkml.kernel.org/r/20200505134100.575356107@linutronix.de --- kernel/rcu/tree.c | 83 +++++++++++++++++++++++++++--------------------- kernel/rcu/tree_plugin.h | 4 +-- kernel/rcu/update.c | 3 +- 3 files changed, 49 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f288477ee1c2..0713ef3fa560 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -88,9 +88,6 @@ */ #define RCU_DYNTICK_CTRL_MASK 0x1 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, @@ -242,7 +239,7 @@ void rcu_softirq_qs(void) * RCU is watching prior to the call to this function and is no longer * watching upon return. */ -static void rcu_dynticks_eqs_enter(void) +static noinstr void rcu_dynticks_eqs_enter(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -267,7 +264,7 @@ static void rcu_dynticks_eqs_enter(void) * called from an extended quiescent state, that is, RCU is not watching * prior to the call to this function and is watching upon return. */ -static void rcu_dynticks_eqs_exit(void) +static noinstr void rcu_dynticks_eqs_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -285,8 +282,6 @@ static void rcu_dynticks_eqs_exit(void) if (seq & RCU_DYNTICK_CTRL_MASK) { atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); } } @@ -314,7 +309,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. 
*/ -static bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -603,7 +598,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter(bool user) +static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -618,12 +613,14 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); + instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... rcu_dynticks_eqs_enter(); @@ -660,7 +657,7 @@ void rcu_idle_enter(void) * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_enter(void) +noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); @@ -693,19 +690,23 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { + instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); + instrumentation_end(); return; } + instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) rcu_prepare_for_idle(); + instrumentation_end(); // RCU is watching here ... rcu_dynticks_eqs_enter(); @@ -721,7 +722,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +void noinstr rcu_nmi_exit(void) { rcu_nmi_exit_common(false); } @@ -745,7 +746,7 @@ void rcu_nmi_exit(void) * If you add or remove a call to rcu_irq_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_exit(void) +void noinstr rcu_irq_exit(void) { lockdep_assert_irqs_disabled(); rcu_nmi_exit_common(true); @@ -774,7 +775,7 @@ void rcu_irq_exit_irqson(void) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void rcu_eqs_exit(bool user) +static void noinstr rcu_eqs_exit(bool user) { struct rcu_data *rdp; long oldval; @@ -792,12 +793,14 @@ static void rcu_eqs_exit(bool user) // RCU is not watching here ... rcu_dynticks_eqs_exit(); // ... but is watching here. 
+ instrumentation_begin(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } /** @@ -828,7 +831,7 @@ void rcu_idle_exit(void) * If you add or remove a call to rcu_user_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_exit(void) +void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } @@ -876,28 +879,35 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rcu_cleanup_after_idle(); incby = 1; - } else if (irq && tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the extended - // quiescent state and this was an interrupt (not an NMI). - // Therefore, (1) RCU is already watching and (2) The fact - // that we are in an interrupt handler and that the rcu_node - // lock is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } else if (irq) { + instrumentation_begin(); + if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { + // We get here only if we had already exited the + // extended quiescent state and this was an + // interrupt (not an NMI). Therefore, (1) RCU is + // already watching and (2) The fact that we are in + // an interrupt handler and that the rcu_node lock + // is an irq-disabled lock prevents self-deadlock. + // So we can safely recheck under the lock. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU + // needs a quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } - raw_spin_unlock_rcu_node(rdp->mynode); + instrumentation_end(); } + instrumentation_begin(); trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); @@ -906,11 +916,10 @@ static __always_inline void rcu_nmi_enter_common(bool irq) /** * rcu_nmi_enter - inform RCU of entry to NMI context */ -void rcu_nmi_enter(void) +noinstr void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } -NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -934,7 +943,7 @@ NOKPROBE_SYMBOL(rcu_nmi_enter); * If you add or remove a call to rcu_irq_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. 
*/ -void rcu_irq_enter(void) +noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); rcu_nmi_enter_common(true); @@ -979,7 +988,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. */ -bool notrace rcu_is_watching(void) +bool rcu_is_watching(void) { bool ret; @@ -1031,12 +1040,12 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi() || !rcu_scheduler_fully_active) return true; - preempt_disable(); + preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ret = true; - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 50caa3fcbad2..352223664ebd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2539,7 +2539,7 @@ static void rcu_bind_gp_kthread(void) } /* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) +static void noinstr rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); @@ -2547,7 +2547,7 @@ static void rcu_dynticks_task_enter(void) } /* Record no current task on dyntick-idle exit. */ -static void rcu_dynticks_task_exit(void) +static void noinstr rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ce63a91d956..84843adfd939 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -284,13 +284,12 @@ struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? -- cgit v1.2.3 From 9ea366f669ded353ae49754216c042e7d2f72ba6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Feb 2020 12:31:16 -0800 Subject: rcu: Make RCU IRQ enter/exit functions rely on in_nmi() The rcu_nmi_enter_common() and rcu_nmi_exit_common() functions take an "irq" parameter that indicates whether these functions have been invoked from an irq handler (irq==true) or an NMI handler (irq==false). However, recent changes have applied notrace to a few critical functions such that rcu_nmi_enter_common() and rcu_nmi_exit_common() many now rely on in_nmi(). Note that in_nmi() works no differently than before, but rather that tracing is now prohibited in code regions where in_nmi() would incorrectly report NMI state. Therefore remove the "irq" parameter and inline rcu_nmi_enter_common() and rcu_nmi_exit_common() into rcu_nmi_enter() and rcu_nmi_exit(), respectively. Signed-off-by: Paul E. 
McKenney Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.617130349@linutronix.de --- kernel/rcu/tree.c | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0713ef3fa560..945401674b9d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -664,16 +664,18 @@ noinstr void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ -/* +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test + * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -704,7 +706,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (irq) + if (!in_nmi()) rcu_prepare_for_idle(); instrumentation_end(); @@ -712,21 +714,10 @@ static __always_inline void rcu_nmi_exit_common(bool irq) rcu_dynticks_eqs_enter(); // ... but is no longer watching here. - if (irq) + if (!in_nmi()) rcu_dynticks_task_enter(); } -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -749,7 +740,7 @@ void noinstr rcu_nmi_exit(void) void noinstr rcu_irq_exit(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); + rcu_nmi_exit(); } /* @@ -838,7 +829,7 @@ void noinstr rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter_common - inform RCU of entry to NMI context + * rcu_nmi_enter - inform RCU of entry to NMI context * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and @@ -847,10 +838,10 @@ void noinstr rcu_user_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test + * If you add or remove a call to rcu_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -868,18 +859,18 @@ static __always_inline void rcu_nmi_enter_common(bool irq) */ if (rcu_dynticks_curr_cpu_in_eqs()) { - if (irq) + if (!in_nmi()) rcu_dynticks_task_exit(); // RCU is not watching here ... rcu_dynticks_eqs_exit(); // ... but is watching here. 
- if (irq) + if (!in_nmi()) rcu_cleanup_after_idle(); incby = 1; - } else if (irq) { + } else if (!in_nmi()) { instrumentation_begin(); if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && @@ -913,14 +904,6 @@ static __always_inline void rcu_nmi_enter_common(bool irq) barrier(); } -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -noinstr void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); -} - /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -946,7 +929,7 @@ noinstr void rcu_nmi_enter(void) noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + rcu_nmi_enter(); } /* -- cgit v1.2.3 From 8ae0ae6737ad449c8ae21e2bb01d9736f360a933 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 May 2020 15:08:52 +0200 Subject: rcu: Provide rcu_irq_exit_preempt() Interrupts and exceptions invoke rcu_irq_enter() on entry and need to invoke rcu_irq_exit() before they either return to the interrupted code or invoke the scheduler due to preemption. The general assumption is that RCU idle code has to have preemption disabled so that a return from interrupt cannot schedule. So the return from interrupt code invokes rcu_irq_exit() and preempt_schedule_irq(). If there is any imbalance in the rcu_irq/nmi* invocations or RCU idle code had preemption enabled then this goes unnoticed until the CPU goes idle or some other RCU check is executed. Provide rcu_irq_exit_preempt() which can be invoked from the interrupt/exception return code in case that preemption is enabled. It invokes rcu_irq_exit() and contains a few sanity checks in case that CONFIG_PROVE_RCU is enabled to catch such issues directly. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.364456424@linutronix.de --- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 945401674b9d..62ee01299386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -743,6 +743,28 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From b1fcf9b83c4149c63d1e0c699e85f93cbe28e211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2020 09:44:43 +0200 Subject: rcu: Provide __rcu_is_watching() Same as rcu_is_watching() but without the preempt_disable/enable() pair inside the function. It is merked noinstr so it ends up in the non-instrumentable text section. This is useful for non-preemptible code especially in the low level entry section. 
Using rcu_is_watching() there results in a call to the preempt_schedule_notrace() thunk which triggers noinstr section warnings in objtool. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200512213810.518709291@linutronix.de --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62ee01299386..90c8be22d57a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -985,6 +985,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3 From 66e9b0717102507e64f638790eaece88765cc9e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2020 14:04:34 +0100 Subject: kprobes: Prevent probes in .noinstr.text section Instrumentation is forbidden in the .noinstr.text section. Make kprobes respect this. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200505134100.179862032@linutronix.de --- kernel/kprobes.c | 18 ++++++++++++++++++ kernel/module.c | 3 +++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9eb5acf0a9f3..3f310df4a693 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2229,6 +2229,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? : arch_populate_kprobe_blacklist(); } @@ -2248,6 +2254,12 @@ static void add_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_add_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } } static void remove_module_kprobe_blacklist(struct module *mod) @@ -2265,6 +2277,12 @@ static void remove_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_remove_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } } /* Module notifier call back, checking kprobes on the module */ diff --git a/kernel/module.c b/kernel/module.c index faf733789560..72ed2b3a6ee2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3150,6 +3150,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), -- cgit v1.2.3 From c73be61cede5882f9605a852414db559c0ebedfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Add general notification queue support Make it possible to have a general notification queue built on top of a standard pipe. Notifications are 'spliced' into the pipe and then read out. 
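A hedged sketch of how a userspace consumer might walk a read() buffer from such a pipe, based only on the message format described in this commit (a length carried in the low bits of info, messages never split across pipe buffers, no alignment guarantee). The struct layout and the 0x7f length mask below are assumptions standing in for the real UAPI definitions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* mirror of the header described in this commit; the real definition uses
 * bit-fields (type:24, subtype:8) and lives in the UAPI watch_queue header */
struct demo_watch_notification {
	uint32_t type_subtype;
	uint32_t info;
};

#define DEMO_WATCH_INFO_LENGTH	0x0000007f	/* length in bytes (assumed mask) */

static void drain_notifications(int fd)
{
	unsigned char buf[4096];
	ssize_t n = read(fd, buf, sizeof(buf));
	size_t pos = 0;

	while (n > 0 && pos + sizeof(struct demo_watch_notification) <= (size_t)n) {
		struct demo_watch_notification wn;
		size_t len;

		/* messages are not guaranteed to be aligned, so copy out */
		memcpy(&wn, buf + pos, sizeof(wn));
		len = wn.info & DEMO_WATCH_INFO_LENGTH;
		if (len < sizeof(wn) || pos + len > (size_t)n)
			break;			/* malformed or truncated */
		printf("notification: info %#x, %zu bytes\n",
		       (unsigned int)wn.info, len);
		pos += len;			/* next message starts right after */
	}
}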
splice(), vmsplice() and sendfile() are forbidden on pipes used for notifications as post_one_notification() cannot take pipe->mutex. This means that notifications could be posted in between individual pipe buffers, making iov_iter_revert() difficult to effect. The way the notification queue is used is: (1) An application opens a pipe with a special flag and indicates the number of messages it wishes to be able to queue at once (this can only be set once): pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, queue_depth); (2) The application then uses poll() and read() as normal to extract data from the pipe. read() will return multiple notifications if the buffer is big enough, but it will not split a notification across buffers - rather it will return a short read or EMSGSIZE. Notification messages include a length in the header so that the caller can split them up. Each message has a header that describes it: struct watch_notification { __u32 type:24; __u32 subtype:8; __u32 info; }; The type indicates the source (eg. mount tree changes, superblock events, keyring changes, block layer events) and the subtype indicates the event type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field indicates a number of things, including the entry length, an ID assigned to a watchpoint contributing to this buffer and type-specific flags. Supplementary data, such as the key ID that generated an event, can be attached in additional slots. The maximum message size is 127 bytes. Messages may not be padded or aligned, so there is no guarantee, for example, that the notification type will be on a 4-byte bounary. Signed-off-by: David Howells --- kernel/Makefile | 1 + kernel/watch_queue.c | 657 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 658 insertions(+) create mode 100644 kernel/watch_queue.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..41e7e3ae07ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 000000000000..c103e34f8705 --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. 
+ */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +static int watch_queue_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return -1; /* No. */ +} + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .confirm = generic_pipe_buf_confirm, + .release = watch_queue_pipe_buf_release, + .steal = watch_queue_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. + */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = 0; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in units of sizeof(*n). 
+ */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocation for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue. 
+ */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch_queue - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue. 
+ */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. 
+ */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} -- cgit v1.2.3 From 8cfba76383e902acbed95092163052b1572f17a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Allow buffers to be marked read-whole-or-error for notifications Allow a buffer to be marked such that read() must return the entire buffer in one go or return ENOBUFS. 
Multiple buffers can be amalgamated into a single read, but a short read will occur if the next "whole" buffer won't fit. This is useful for watch queue notifications to make sure we don't split a notification across multiple reads, especially given that we need to fabricate an overrun record under some circumstances - and that isn't in the buffers. Signed-off-by: David Howells --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c103e34f8705..ad64ea300f6d 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -115,7 +115,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; - buf->flags = 0; + buf->flags = PIPE_BUF_FLAG_WHOLE; pipe->head = head + 1; if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { -- cgit v1.2.3 From e7d553d69cf63aec7de0f38fed49ccbb30922e1e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:12 +0000 Subject: pipe: Add notification lossage handling Add handling for loss of notifications by having read() insert a loss-notification message after it has read the pipe buffer that was last in the ring when the loss occurred. Lossage can come about either by running out of notification descriptors or by running out of space in the pipe ring. Signed-off-by: David Howells --- kernel/watch_queue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index ad64ea300f6d..9a9699c06709 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -132,6 +132,8 @@ out: return done; lost: + buf = &pipe->bufs[(head - 1) & mask]; + buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } -- cgit v1.2.3 From ab7e9b067f3d9cbec28cfca51d341efb421b7a51 Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Thu, 7 May 2020 09:19:52 +0200 Subject: PM: hibernate: Incorporate concurrency handling Hibernation concurrency handling is currently delegated to user.c, where it's also used for regulating the access to the snapshot device. In the prospective of making user.c a separate configuration option, such mutual exclusion is brought into hibernate.c and made available through accessor helpers hereby introduced. Signed-off-by: Domenico Andreoli Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 20 ++++++++++++++++---- kernel/power/power.h | 4 ++-- kernel/power/user.c | 10 ++++------ 3 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30bd28d1d418..02ec716a4927 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -67,6 +67,18 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); @@ -704,7 +716,7 @@ int hibernate(void) lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } @@ -775,7 +787,7 @@ int hibernate(void) Exit: __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: unlock_system_sleep(); pr_info("hibernation exit\n"); @@ -880,7 +892,7 @@ static int software_resume(void) goto Unlock; /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(FMODE_READ); goto Unlock; @@ -911,7 +923,7 @@ static int software_resume(void) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); pr_info("resume failed (%d)\n", error); - atomic_inc(&snapshot_device_available); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7cdc64dc2373..ba2094db6294 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. 
*/ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7959449765d9..98548d1cf8a6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -37,8 +37,6 @@ static struct snapshot_data { bool free_bitmaps; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); - static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; @@ -49,13 +47,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) lock_system_sleep(); - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } @@ -92,7 +90,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) - atomic_inc(&snapshot_device_available); + hibernate_release(); data->frozen = false; data->ready = false; @@ -122,7 +120,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); unlock_system_sleep(); -- cgit v1.2.3 From c4f39a6c74389fcc93ac39056ef342f32ab57a23 Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Thu, 7 May 2020 09:19:53 +0200 Subject: PM: hibernate: Split off snapshot dev option Make it possible to reduce the attack surface in case the snapshot device is not to be used from userspace. Signed-off-by: Domenico Andreoli Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 12 ++++++++++++ kernel/power/Makefile | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c208566c844b..4d0e6e815a2b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,6 +80,18 @@ config HIBERNATION For more information take a look at . +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + ---help--- + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. 
+ config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index e7e47d9be1e5..5899260a8bef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows to avoid packet-based NAT given in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but in a few occasions we've seen breakage when validating the peer's address since it returns unexpectedly the backend tuple instead of the service one. Therefore, this trivial patch allows to customise and adds a getpeername() as well as getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. 
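For illustration only (not part of this patch), a reverse-xlation program for the IPv4 getpeername hook might look roughly like the sketch below. It assumes a libbpf recent enough to recognise the "cgroup/getpeername4" section name, and the addresses simply mirror the curl example above:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>
        #include <bpf/bpf_endian.h>

        #define BACKEND_IP4 bpf_htonl(0x0a00000a)  /* 10.0.0.10, illustrative */
        #define SERVICE_IP4 bpf_htonl(0x01020304)  /* 1.2.3.4, illustrative */

        SEC("cgroup/getpeername4")
        int rev_xlate_peer(struct bpf_sock_addr *ctx)
        {
                /* Report the service tuple instead of the selected backend. */
                if (ctx->user_ip4 == BACKEND_IP4 && ctx->user_port == bpf_htons(80))
                        ctx->user_ip4 = SERVICE_IP4;  /* port is already the service port */

                return 1;  /* the verifier only accepts 1 for these attach types */
        }

        char _license[] SEC("license") = "GPL";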
[0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: -- cgit v1.2.3 From 7d148be69e3a0eaa9d029a3c51b545e322116a99 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 13 May 2020 15:55:02 +0200 Subject: sched/fair: Optimize enqueue_task_fair() enqueue_task_fair jumps to enqueue_throttle label when cfs_rq_of(se) is throttled which means that se can't be NULL in such case and we can move the label after the if (!se) statement. Futhermore, the latter can be removed because se is always NULL when reaching this point. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200513135502.4672-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a58874ef104..4e586863827b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5512,28 +5512,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() -- cgit v1.2.3 From 12aa2587388de6697fd2e585ae6a90f70249540b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 7 May 2020 11:10:39 +0800 Subject: sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr() The cpuacct_charge() and cpuacct_account_field() are called with rq->lock held, and this means preemption(and IRQs) are indeed disabled, so it is safe to use __this_cpu_*() to allow for better code-generation. 
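A minimal sketch of the pattern (kernel context, not from the patch; it assumes the caller already runs with preemption and IRQs disabled, as is the case under rq->lock):

        #include <linux/percpu.h>
        #include <linux/types.h>

        static DEFINE_PER_CPU(u64, demo_cputime);

        static void demo_charge(u64 delta)
        {
                /*
                 * Equivalent to "*this_cpu_ptr(&demo_cputime) += delta;", but the
                 * raw op lets the compiler emit a single per-CPU add because the
                 * caller guarantees preemption (and IRQs) are disabled.
                 */
                __this_cpu_add(demo_cputime, delta);
        }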
Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507031039.32615-1-songmuchun@bytedance.com --- kernel/sched/cpuacct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..6448b0438ffb 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -347,7 +347,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + __this_cpu_add(ca->cpuusage->usages[index], cputime); rcu_read_unlock(); } @@ -363,7 +363,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; + __this_cpu_add(ca->cpustat->cpustat[index], val); rcu_read_unlock(); } -- cgit v1.2.3 From 95d685935a2edf209fc68f52494ede4a382a6c2b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 6 May 2020 17:53:01 +0200 Subject: sched/pelt: Sync util/runnable_sum with PELT window when propagating update_tg_cfs_*() propagate the impact of the attach/detach of an entity down into the cfs_rq hierarchy and must keep the sync with the current pelt window. Even if we can't sync child cfs_rq and its group se, we can sync the group se and its parent cfs_rq with current position in the PELT window. In fact, we must keep them sync in order to stay also synced with others entities and group entities that are already attached to the cfs_rq. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200506155301.14288-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 49 +++++++++++++++++++++++++++---------------------- kernel/sched/pelt.c | 24 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e586863827b..44b0c8edc260 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3441,52 +3441,46 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. 
+ */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + se->avg.runnable_sum = se->avg.runnable_avg * divider; /* Update parent cfs_rq runnable */ add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; } static inline void @@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq unsigned long load_avg; u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b647d04d9c8b..b4b1ff96642f 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. 
+ */ static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { -- cgit v1.2.3 From 04f5c362ec6d3ff0e14f1c05230b550da7f528a4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:21:41 -0500 Subject: sched/fair: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507192141.GA16183@embeddedor --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44b0c8edc260..01f94cf52783 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1094,7 +1094,7 @@ struct numa_group { * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 21416b30c520..2bd2a222318a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1462,7 +1462,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) -- cgit v1.2.3 From dbe9337109c2705f08e6a00392f991eb2d2570a5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 20 Apr 2020 15:04:53 +0800 Subject: sched/cpuacct: Fix charge cpuacct.usage_sys The user_mode(task_pt_regs(tsk)) always return true for user thread, and false for kernel thread. So it means that the cpuacct.usage_sys is the time that kernel thread uses not the time that thread uses in the kernel mode. We can try get_irq_regs() first, if it is NULL, then we can fall back to task_pt_regs(). 
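The hunk below uses the GCC/Clang conditional operator with an omitted middle operand: "x ? : y" evaluates x once and yields x if it is non-zero, else y. A tiny userspace illustration of the same construct, purely for readers unfamiliar with it:

        #include <stdio.h>

        static const int *pick(const int *irq_regs, const int *task_regs)
        {
                return irq_regs ? : task_regs;  /* same as: irq_regs ? irq_regs : task_regs */
        }

        int main(void)
        {
                int a = 1, b = 2;

                printf("%d\n", *pick(NULL, &b));  /* prints 2: fell back */
                printf("%d\n", *pick(&a, &b));    /* prints 1: first operand used */
                return 0;
        }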
Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200420070453.76815-1-songmuchun@bytedance.com --- kernel/sched/cpuacct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 6448b0438ffb..941c28cf9738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -5,6 +5,7 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include #include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ @@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); + struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); if (regs && user_mode(regs)) index = CPUACCT_STAT_USER; -- cgit v1.2.3 From d505b8af58912ae1e1a211fabc9995b19bd40828 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Sat, 25 Apr 2020 18:52:48 +0800 Subject: sched: Defend cfs and rt bandwidth quota against overflow When users write some huge number into cpu.cfs_quota_us or cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of schedulable checks. to_ratio() could be altered to avoid unnecessary internal overflow, but min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to prevent overflow. Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20200425105248.60093-1-changhuaixin@linux.alibaba.com --- kernel/sched/core.c | 8 ++++++++ kernel/sched/rt.c | 12 +++++++++++- kernel/sched/sched.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74fb89b5ce3e..fa905b6daafa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7379,6 +7379,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7406,6 +7408,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..6d60ba21ed29 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,8 @@ int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. 
+ */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2bd2a222318a..f7ab6334e992 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1915,6 +1915,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.2.3 From c50c75e9b87946499a62bffc021e95c87a1d57cd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 11 May 2020 15:12:27 -0500 Subject: perf/core: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200511201227.GA14041@embeddedor --- kernel/events/callchain.c | 2 +- kernel/events/internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..b1991043b7d8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); -- cgit v1.2.3 From db78538c75e49c09b002a2cd96a19ae0c39be771 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:58:04 -0500 Subject: locking/lockdep: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507185804.GA15036@embeddedor --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..cfdff122905b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -489,7 +489,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) -- cgit v1.2.3 From c143b7753b308d5f7d03b165a7fdff1dda1f271d Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 15 May 2020 10:08:28 +0000 Subject: ftrace: show debugging information when panic_on_warn set When an anomaly is detected in the function call modification code, ftrace_bug() is called to disable function tracing as well as give some warn and information that may help debug the problem. But currently, we call FTRACE_WARN_ON_ONCE() first in ftrace_bug(), so when panic_on_warn is set, we can't see the debugging information here. Call FTRACE_WARN_ON_ONCE() at the end of ftrace_bug() to ensure that the debugging information is displayed first. after this patch, the dmesg looks like: ------------[ ftrace bug ]------------ ftrace failed to modify [] bcm2835_handle_irq+0x4/0x58 actual: 1f:20:03:d5 Setting ftrace call site to call ftrace function ftrace record flags: 80000001 (1) expected tramp: ffff80001009d6f0 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1635 at kernel/trace/ftrace.c:2078 ftrace_bug+0x204/0x238 Kernel panic - not syncing: panic_on_warn set ... CPU: 2 PID: 1635 Comm: sh Not tainted 5.7.0-rc5-00033-gb922183867f5 #14 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1b0 show_stack+0x20/0x30 dump_stack+0xc0/0x10c panic+0x16c/0x368 __warn+0x120/0x160 report_bug+0xc8/0x160 bug_handler+0x28/0x98 brk_handler+0x70/0xd0 do_debug_exception+0xcc/0x1ac el1_sync_handler+0xe4/0x120 el1_sync+0x7c/0x100 ftrace_bug+0x204/0x238 Link: https://lkml.kernel.org/r/20200515100828.7091-1-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..cd39cbf3631a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2027,14 +2027,14 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) { unsigned long ip = rec ? 
rec->ip : 0; + pr_info("------------[ ftrace bug ]------------\n"); + switch (failed) { case -EFAULT: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; case -EINVAL: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); @@ -2045,12 +2045,10 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } break; case -EPERM: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); print_ip_sym(ip); break; default: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } @@ -2077,6 +2075,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) ip = ftrace_get_addr_curr(rec); pr_cont("\n expected tramp: %lx\n", ip); } + + FTRACE_WARN_ON_ONCE(1); } static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) -- cgit v1.2.3 From fc9d276f22330e9322a9c592c71e0571810205f7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 7 May 2020 21:30:08 +0200 Subject: tracing/probe: reverse arguments to list_add Elsewhere in the file, the function trace_kprobe_has_same_kprobe uses a trace_probe_event.probes object as the second argument of list_for_each_entry, ie as a list head, while the list_for_each_entry iterates over the list fields of the trace_probe structures, making them the list elements. So, exchange the arguments on the list_add call to put the list head in the second argument. Since both list_head structures were just initialized, this problem did not cause any loss of information. Link: https://lkml.kernel.org/r/1588879808-24488-1-git-send-email-Julia.Lawall@inria.fr Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Acked-by: Masami Hiramatsu Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ab8b6436d53f..b8a928e925c7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event, INIT_LIST_HEAD(&tp->event->class.fields); INIT_LIST_HEAD(&tp->event->probes); INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); + list_add(&tp->list, &tp->event->probes); call = trace_probe_event_call(tp); call->class = &tp->event->class; -- cgit v1.2.3 From 6797d97ab9d1b0ef94bf6063920669409dc2d730 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:13 +0200 Subject: trace: remove tracing_pipe_buf_ops tracing_pipe_buf_ops has identical ops to default_pipe_buf_ops, so use that instead. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..bc9783797d27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6304,13 +6304,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { @@ -6372,7 +6365,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .ops = &tracing_pipe_buf_ops, + .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; -- cgit v1.2.3 From 76887c256744740d6121af9bc4aa787712a1f694 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:14 +0200 Subject: fs: make the pipe_buf_operations ->steal operation optional Just return 1 for failure if it is not present. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc9783797d27..29fa25cfb6c2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7576,7 +7576,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, static const struct pipe_buf_operations buffer_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; -- cgit v1.2.3 From b8d9e7f2411b0744df2ec33e80d7698180fef21a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:15 +0200 Subject: fs: make the pipe_buf_operations ->confirm operation optional Just return 0 for success if it is not present. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/relay.c | 1 - kernel/trace/trace.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..c5ece4c2b04d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29fa25cfb6c2..63d1ab978435 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7574,7 +7574,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; -- cgit v1.2.3 From c928f642c29a5ffb02e16f2430b42b876dde69de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:16 +0200 Subject: fs: rename pipe_buf ->steal to ->try_steal And replace the arcane return value convention with a simple bool where true means success and false means failure. 
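Taken together with the two patches above that made ->confirm and ->steal optional, a hedged sketch of a minimal buffer-ops table after the rename (kernel context, not the actual fs code):

        #include <linux/pipe_fs_i.h>

        static const struct pipe_buf_operations demo_pipe_buf_ops = {
                /* no ->confirm: absence now means "always confirmed" */
                .release   = generic_pipe_buf_release,
                /* omit ->try_steal entirely to mean "never steal"; otherwise: */
                .try_steal = generic_pipe_buf_try_steal,  /* bool: true = page stolen */
                .get       = generic_pipe_buf_get,
        };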
[AV: braino fix folded in] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/relay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c5ece4c2b04d..5c2263f3f857 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,9 +1177,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = relay_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -- cgit v1.2.3 From 87b047d2be417b271d80f5e490a825c6fd53ecad Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 16 Mar 2020 12:21:12 -0500 Subject: exec: Teach prepare_exec_creds how exec treats uids & gids It is almost possible to use the result of prepare_exec_creds with no modifications during exec. Update prepare_exec_creds to initialize the suid and the fsuid to the euid, and the sgid and the fsgid to the egid. This is all that is needed to handle the common case of exec when nothing special like a setuid exec is happening. That this preserves the existing behavior of exec can be verified by examing bprm_fill_uid and cap_bprm_set_creds. This change makes it clear that the later parts of exec that update bprm->cred are just need to handle special cases such as setuid exec and change of domains. Link: https://lkml.kernel.org/r/871rng22dm.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/cred.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 71a792616917..421b1149c651 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -315,6 +315,9 @@ struct cred *prepare_exec_creds(void) new->process_keyring = NULL; #endif + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; + return new; } -- cgit v1.2.3 From 9d44a121c5a79bc8a9d67c058456bd52a83c79e7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 20 May 2020 14:47:13 -0400 Subject: audit: add subj creds to NETFILTER_CFG record to Some table unregister actions seem to be initiated by the kernel to garbage collect unused tables that are not initiated by any userspace actions. It was found to be necessary to add the subject credentials to cover this case to reveal the source of these actions. A sample record: The uid, auid, tty, ses and exe fields have not been included since they are in the SYSCALL record and contain nothing useful in the non-user context. 
Here are two sample orphaned records: type=NETFILTER_CFG msg=audit(2020-05-20 12:14:36.505:5) : table=filter family=ipv4 entries=0 op=register pid=1 subj=kernel comm=swapper/0 type=NETFILTER_CFG msg=audit(2020-05-20 12:15:27.701:301) : table=nat family=bridge entries=0 op=unregister pid=30 subj=system_u:system_r:kernel_t:s0 comm=kworker/u4:1 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cfe3486e5f31..468a23390457 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2557,12 +2557,18 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op) { struct audit_buffer *ab; + char comm[sizeof(current->comm)]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); + + audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_end(ab); } EXPORT_SYMBOL_GPL(__audit_log_nfcfg); -- cgit v1.2.3 From 181e9d4efaf6aa8d1e7d510aeb7114c0f276fad7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:25 +0300 Subject: irqdomain: Make __irq_domain_add() less OF-dependent __irq_domain_add() relies in some places on the fact that the fwnode can be only of type OF. This prevents refactoring of the code to support other types of fwnode. Make it less OF-dependent by switching it to use the fwnode directly where it makes sense. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-1-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e2aa128ea3ee..7649f3848ab5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { - struct device_node *of_node = to_of_node(fwnode); struct irqchip_fwid *fwid; struct irq_domain *domain; static atomic_t unknown_domains; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), - GFP_KERNEL, of_node_to_nid(of_node)); + GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); if (!domain) return NULL; @@ -177,15 +176,15 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->fwnode = fwnode; #endif - } else if (of_node) { + } else if (is_of_node(fwnode)) { char *name; /* - * DT paths contain '/', which debugfs is legitimately + * fwnode paths contain '/', which debugfs is legitimately * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... 
*/ - name = kasprintf(GFP_KERNEL, "%pOF", of_node); + name = kasprintf(GFP_KERNEL, "%pfw", fwnode); if (!name) { kfree(domain); return NULL; @@ -210,7 +209,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } - of_node_get(of_node); + fwnode_handle_get(fwnode); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -259,7 +258,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(irq_domain_get_of_node(domain)); + fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); -- cgit v1.2.3 From 87526603c89256e18ad2c23821fdaf376b072fc8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:26 +0300 Subject: irqdomain: Get rid of special treatment for ACPI in __irq_domain_add() Now that __irq_domain_add() is able to better deals with generic fwnodes, there is no need to special-case ACPI anymore. Get rid of the special treatment for ACPI. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-2-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7649f3848ab5..5d14d910ab64 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -161,22 +161,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } -#ifdef CONFIG_ACPI - } else if (is_acpi_device_node(fwnode)) { - struct acpi_buffer buf = { - .length = ACPI_ALLOCATE_BUFFER, - }; - acpi_handle handle; - - handle = acpi_device_handle(to_acpi_device_node(fwnode)); - if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { - domain->name = buf.pointer; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - - domain->fwnode = fwnode; -#endif - } else if (is_of_node(fwnode)) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode)) { char *name; /* -- cgit v1.2.3 From 9ed78b05f998050784ae863bd5ba4aea2e2141ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:27 +0300 Subject: irqdomain: Allow software nodes for IRQ domain creation In some cases we need to have an IRQ domain created out of software node. One of such cases is DesignWare GPIO driver when it's instantiated from half-baked ACPI table (alas, we can't fix it for devices which are few years on market) and thus using software nodes to quirk this. But the driver is using IRQ domains based on per GPIO port firmware nodes, which are in the above case software ones. This brings a warning message to be printed [ 73.957183] irq: Invalid fwnode type for irqdomain and creates an anonymous IRQ domain without a debugfs entry. Allowing software nodes to be valid for IRQ domains rids us of the warning and debugs gets correctly populated. % ls -1 /sys/kernel/debug/irq/domains/ ... 
intel-quark-dw-apb-gpio:portA Signed-off-by: Andy Shevchenko [maz: refactored commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-3-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5d14d910ab64..a4c2c915511d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -161,7 +161,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } - } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode)) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || + is_software_node(fwnode)) { char *name; /* -- cgit v1.2.3 From 325606af573152e02f44d791f152b7f9564bcb30 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 19:35:36 +0800 Subject: printk: Fix a typo in comment "interator"->"iterator" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Cc: Steven Rostedt Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..525038745a14 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3360,7 +3360,7 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and @@ -3378,7 +3378,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) } /** - * kmsg_dump_rewind - reset the interator + * kmsg_dump_rewind - reset the iterator * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and -- cgit v1.2.3 From 8ece3b3eb576a78d2e67ad4c3a80a39fa6708809 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Tue, 17 Mar 2020 07:33:44 -0300 Subject: kernel/printk: add kmsg SEEK_CUR handling Userspace libraries, e.g. glibc's dprintf(), perform a SEEK_CUR operation over any file descriptor requested to make sure the current position isn't pointing to junk due to previous manipulation of that same fd. And whenever that fd doesn't have support for such operation, the userspace code expects -ESPIPE to be returned. However, when the fd in question references the /dev/kmsg interface, the current kernel code state returns -EINVAL instead, causing an unexpected behavior in userspace: in the case of glibc, when -ESPIPE is returned it gets ignored and the call completes successfully, while returning -EINVAL forces dprintf to fail without performing any action over that fd: if (_IO_SEEKOFF (fp, (off64_t)0, _IO_seek_cur, _IOS_INPUT|_IOS_OUTPUT) == _IO_pos_BAD && errno != ESPIPE) return NULL; With this patch we make sure to return the correct value when SEEK_CUR is requested over kmsg and also add some kernel doc information to formalize this behavior. 
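A minimal userspace check of the new behaviour (an assumed test program, not part of the patch; the probe glibc actually performs may differ):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

            if (fd < 0)
                    return 1;
            /* With this patch the kernel answers -1/ESPIPE, which
             * dprintf()-style callers treat as "not seekable, carry on"
             * rather than as a hard error. */
            if (lseek(fd, 0, SEEK_CUR) == (off_t)-1 && errno == ESPIPE)
                    printf("SEEK_CUR on /dev/kmsg: ESPIPE as expected\n");
            close(fd);
            return 0;
    }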
Link: https://lore.kernel.org/r/20200317103344.574277-1-bmeneg@redhat.com Cc: linux-kernel@vger.kernel.org Cc: rostedt@goodmis.org, Cc: David.Laight@ACULAB.COM Signed-off-by: Bruno Meneguele Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 525038745a14..35cc5f548860 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -974,6 +974,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) user->idx = log_next_idx; user->seq = log_next_seq; break; + case SEEK_CUR: + /* + * It isn't supported due to the record nature of this + * interface: _SET _DATA and _END point to very specific + * record positions, while _CUR would be more useful in case + * of a byte-based log. Because of that, return the default + * errno value for invalid seek operation. + */ + ret = -ESPIPE; + break; default: ret = -EINVAL; } -- cgit v1.2.3 From d20a1676df7e4c3c23d73299159811a50e4854bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 20 May 2020 21:20:50 +0200 Subject: xsk: Move xskmap.c to net/xdp/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP is partly implemented by net/xdp/xsk.c. Move xskmap.c from kernel/bpf/ to net/xdp/, which is the logical place for AF_XDP related code. Also, move AF_XDP struct definitions, and function declarations only used by AF_XDP internals into net/xdp/xsk.h. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-3-bjorn.topel@gmail.com --- kernel/bpf/Makefile | 3 - kernel/bpf/xskmap.c | 265 ---------------------------------------------------- 2 files changed, 268 deletions(-) delete mode 100644 kernel/bpf/xskmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. 
- */ - -#include -#include -#include -#include -#include - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? 
*(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - 
.map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; -- cgit v1.2.3 From cac616db39c207dc63465a4e05c6ce0e60b2cce4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 May 2020 13:07:26 -0700 Subject: bpf: Verifier track null pointer branch_taken with JNE and JEQ Currently, when considering the branches that may be taken for a jump instruction if the register being compared is a pointer the verifier assumes both branches may be taken. But, if the jump instruction is comparing if a pointer is NULL we have this information in the verifier encoded in the reg->type so we can do better in these cases. Specifically, these two common cases can be handled. * If the instruction is BPF_JEQ and we are comparing against a zero value. This test is 'if ptr == 0 goto +X' then using the type information in reg->type we can decide if the ptr is not null. This allows us to avoid pushing both branches onto the stack and instead only use the != 0 case. For example PTR_TO_SOCK and PTR_TO_SOCK_OR_NULL encode the null pointer. Note if the type is PTR_TO_SOCK_OR_NULL we can not learn anything. And also if the value is non-zero we learn nothing because it could be any arbitrary value a different pointer for example * If the instruction is BPF_JNE and ware comparing against a zero value then a similar analysis as above can be done. The test in asm looks like 'if ptr != 0 goto +X'. Again using the type information if the non null type is set (from above PTR_TO_SOCK) we know the jump is taken. In this patch we extend is_branch_taken() to consider this extra information and to return only the branch that will be taken. This resolves a verifier issue reported with C code like the following. See progs/test_sk_lookup_kern.c in selftests. sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; In the above the bpf_printk() will resolve the pointer from PTR_TO_SOCK_OR_NULL to PTR_TO_SOCK. Then the second test guarding the release will cause the verifier to walk both paths resulting in the an unreleased sock reference. See verifier/ref_tracking.c in selftests for an assembly version of the above. After the above additional logic is added the C code above passes as expected. 
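A compressed restatement of the new special case in is_branch_taken(), purely as a reading aid for the hunk below (names simplified, not the literal verifier code):

    /* reg holds a pointer; val is the constant it is compared against. */
    if (!reg_type_not_null(reg->type) || val != 0)
            return -1;                      /* unknown: walk both branches */
    switch (opcode) {
    case BPF_JEQ: return 0;                 /* "ptr == 0" can never be true */
    case BPF_JNE: return 1;                 /* "ptr != 0" is always true    */
    default:      return -1;                /* other compares stay unknown  */
    }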
Reported-by: Andrey Ignatov Suggested-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159009164651.6313.380418298578070501.stgit@john-Precision-5820-Tower --- kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ed8351f47a4..d2e27dba4ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_BTF_ID; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || @@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. + */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) -- cgit v1.2.3 From 48021f98130880dd74286459a1ef48b5e9bc374f Mon Sep 17 00:00:00 2001 From: Shreyas Joshi Date: Fri, 22 May 2020 16:53:06 +1000 Subject: printk: handle blank console arguments passed in. If uboot passes a blank string to console_setup then it results in a trashed memory. Ultimately, the kernel crashes during freeing up the memory. This fix checks if there is a blank parameter being passed to console_setup from uboot. In case it detects that the console parameter is blank then it doesn't setup the serial device and it gracefully exits. Link: https://lore.kernel.org/r/20200522065306.83-1-shreyas.joshi@biamp.com Signed-off-by: Shreyas Joshi Acked-by: Sergey Senozhatsky [pmladek@suse.com: Better format the commit message and code, remove unnecessary brackets.] 
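As an illustration of how such a blank argument can reach the kernel (a hypothetical U-Boot environment, not taken from the report): something along the lines of "setenv bootargs console= root=/dev/mmcblk0p2" hands console_setup() an empty string and leads to the corruption described above; with the check below the blank option is simply ignored and no serial device is set up.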
Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 35cc5f548860..a3990505abdf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2200,6 +2200,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; + if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1; -- cgit v1.2.3 From c6e7bd7afaeb3af55ffac122828035f1c01d1d7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 24 May 2020 21:29:55 +0100 Subject: sched/core: Optimize ttwu() spinning on p->on_cpu Both Rik and Mel reported seeing ttwu() spend significant time on: smp_cond_load_acquire(&p->on_cpu, !VAL); Attempt to avoid this by queueing the wakeup on the CPU that owns the p->on_cpu value. This will then allow the ttwu() to complete without further waiting. Since we run schedule() with interrupts disabled, the IPI is guaranteed to happen after p->on_cpu is cleared, this is what makes it safe to queue early. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Cc: Jirka Hladky Cc: Vincent Guittot Cc: valentin.schneider@arm.com Cc: Hillf Danton Cc: Rik van Riel Link: https://lore.kernel.org/r/20200524202956.27665-2-mgorman@techsingularity.net --- kernel/sched/core.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa905b6daafa..903c9ee1db45 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,7 +2312,7 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -2354,6 +2354,17 @@ bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_remote(p, cpu, wake_flags); + return true; + } + + return false; +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2362,11 +2373,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_remote(p, cpu, wake_flags)) return; - } #endif rq_lock(rq, &rf); @@ -2548,7 +2556,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->on_rq && ttwu_remote(p, wake_flags)) goto unlock; + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + #ifdef CONFIG_SMP + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. 
@@ -2570,6 +2586,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + */ + if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags)) + goto unlock; + /* * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. @@ -2581,28 +2607,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -- cgit v1.2.3 From 2ebb17717550607bcd85fb8cf7d24ac870e9d762 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sun, 24 May 2020 21:29:56 +0100 Subject: sched/core: Offload wakee task activation if it the wakee is descheduling The previous commit: c6e7bd7afaeb: ("sched/core: Optimize ttwu() spinning on p->on_cpu") avoids spinning on p->on_rq when the task is descheduling, but only if the wakee is on a CPU that does not share cache with the waker. This patch offloads the activation of the wakee to the CPU that is about to go idle if the task is the only one on the runqueue. This potentially allows the waker task to continue making progress when the wakeup is not strictly synchronous. This is very obvious with netperf UDP_STREAM running on localhost. The waker is sending packets as quickly as possible without waiting for any reply. It frequently wakes the server for the processing of packets and when netserver is using local memory, it quickly completes the processing and goes back to idle. The waker often observes that netserver is on_rq and spins excessively leading to a drop in throughput. This is a comparison of 5.7-rc6 against "sched: Optimize ttwu() spinning on p->on_cpu" and against this patch labeled vanilla, optttwu-v1r1 and localwakelist-v1r2 respectively. 5.7.0-rc6 5.7.0-rc6 5.7.0-rc6 vanilla optttwu-v1r1 localwakelist-v1r2 Hmean send-64 251.49 ( 0.00%) 258.05 * 2.61%* 305.59 * 21.51%* Hmean send-128 497.86 ( 0.00%) 519.89 * 4.43%* 600.25 * 20.57%* Hmean send-256 944.90 ( 0.00%) 997.45 * 5.56%* 1140.19 * 20.67%* Hmean send-1024 3779.03 ( 0.00%) 3859.18 * 2.12%* 4518.19 * 19.56%* Hmean send-2048 7030.81 ( 0.00%) 7315.99 * 4.06%* 8683.01 * 23.50%* Hmean send-3312 10847.44 ( 0.00%) 11149.43 * 2.78%* 12896.71 * 18.89%* Hmean send-4096 13436.19 ( 0.00%) 13614.09 ( 1.32%) 15041.09 * 11.94%* Hmean send-8192 22624.49 ( 0.00%) 23265.32 * 2.83%* 24534.96 * 8.44%* Hmean send-16384 34441.87 ( 0.00%) 36457.15 * 5.85%* 35986.21 * 4.48%* Note that this benefit is not universal to all wakeups, it only applies to the case where the waker often spins on p->on_rq. 
The impact can be seen from a "perf sched latency" report generated from a single iteration of one packet size: ----------------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------------- vanilla netperf:4337 | 21709.193 ms | 2932 | avg: 0.002 ms | max: 0.041 ms | max at: 112.154512 s netserver:4338 | 14629.459 ms | 5146990 | avg: 0.001 ms | max: 1615.864 ms | max at: 140.134496 s localwakelist-v1r2 netperf:4339 | 29789.717 ms | 2460 | avg: 0.002 ms | max: 0.059 ms | max at: 138.205389 s netserver:4340 | 18858.767 ms | 7279005 | avg: 0.001 ms | max: 0.362 ms | max at: 135.709683 s ----------------------------------------------------------------------------------------------------------------- Note that the average wakeup delay is quite small on both the vanilla kernel and with the two patches applied. However, there are significant outliers with the vanilla kernel with the maximum one measured as 1615 milliseconds with a vanilla kernel but never worse than 0.362 ms with both patches applied and a much higher rate of context switching. Similarly a separate profile of cycles showed that 2.83% of all cycles were spent in try_to_wake_up() with almost half of the cycles spent on spinning on p->on_rq. With the two patches, the percentage of cycles spent in try_to_wake_up() drops to 1.13% Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Jirka Hladky Cc: Vincent Guittot Cc: valentin.schneider@arm.com Cc: Hillf Danton Cc: Rik van Riel Link: https://lore.kernel.org/r/20200524202956.27665-3-mgorman@techsingularity.net --- kernel/sched/core.c | 39 +++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 3 ++- 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 903c9ee1db45..6130ab170e93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,7 +2312,13 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -2355,11 +2361,32 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. 
+ */ + if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - __ttwu_queue_remote(p, cpu, wake_flags); + __ttwu_queue_wakelist(p, cpu, wake_flags); return true; } @@ -2373,7 +2400,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (ttwu_queue_remote(p, cpu, wake_flags)) + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; #endif @@ -2593,7 +2620,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags)) + if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) goto unlock; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7ab6334e992..4b32cff0dcbe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1685,7 +1685,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_RQ 0x08 /* Wakee is on_rq */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From aaf2bc50df1f4bfc6857fc601fc7b21d5a18c6a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 May 2020 22:05:15 +0200 Subject: rcu: Abstract out rcu_irq_enter_check_tick() from rcu_nmi_enter() There will likely be exception handlers that can sleep, which rules out the usual approach of invoking rcu_nmi_enter() on entry and also rcu_nmi_exit() on all exit paths. However, the alternative approach of just not calling anything can prevent RCU from coaxing quiescent states from nohz_full CPUs that are looping in the kernel: RCU must instead IPI them explicitly. It would be better to enable the scheduler tick on such CPUs to interact with RCU in a lighter-weight manner, and this enabling is one of the things that rcu_nmi_enter() currently does. What is needed is something that helps RCU coax quiescent states while not preventing subsequent sleeps. This commit therefore splits out the nohz_full scheduler-tick enabling from the rest of the rcu_nmi_enter() logic into a new function named rcu_irq_enter_check_tick(). [ tglx: Renamed the function and made it a nop when context tracking is off ] [ mingo: Fixed a CONFIG_NO_HZ_FULL assumption, harmonized and fixed all the comment blocks and cleaned up rcu_nmi_enter()/exit() definitions. ] Suggested-by: Andy Lutomirski Signed-off-by: Paul E. 
McKenney Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200521202116.996113173@linutronix.de --- kernel/rcu/tree.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90c8be22d57a..b7f8c494d1d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -848,6 +848,67 @@ void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! 
+ WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** @@ -894,26 +955,7 @@ noinstr void rcu_nmi_enter(void) incby = 1; } else if (!in_nmi()) { instrumentation_begin(); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the - // extended quiescent state and this was an - // interrupt (not an NMI). Therefore, (1) RCU is - // already watching and (2) The fact that we are in - // an interrupt handler and that the rcu_node lock - // is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); - } + rcu_irq_enter_check_tick(); instrumentation_end(); } instrumentation_begin(); -- cgit v1.2.3 From 07325d4a90d2d84de45cc07b134fd0f023dbb971 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:16 +0200 Subject: rcu: Provide rcu_irq_exit_check_preempt() Provide a debug check which can be invoked from exception return to kernel mode before an attempt is made to schedule. Warn if RCU is not ready for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Paul E. McKenney Link: https://lore.kernel.org/r/20200521202117.089709607@linutronix.de --- kernel/rcu/tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7f8c494d1d1..d8e9dbbefcfa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -765,6 +765,24 @@ void rcu_irq_exit_preempt(void) "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From 6b6ebb34744b21467aa01be7c53cc570fc41f70d Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Wed, 13 May 2020 10:13:11 +0800 Subject: cgroup: Remove stale comments - The default root is where we can create v2 cgroups. - The __DEVEL__sane_behavior mount option has been removed long long ago. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..7a016749de21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ +/* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); @@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid) * cases where a subsystem should behave differnetly depending on the * interface version. * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" -- cgit v1.2.3 From 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 May 2020 14:06:17 -0700 Subject: /dev/mem: Revoke mappings when a driver claims the region Close the hole of holding a mapping over kernel driver takeover event of a given address range. Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the kernel against scenarios where a /dev/mem user tramples memory that a kernel driver owns. However, this protection only prevents *new* read(), write() and mmap() requests. Established mappings prior to the driver calling request_mem_region() are left alone. Especially with persistent memory, and the core kernel metadata that is stored there, there are plentiful scenarios for a /dev/mem user to violate the expectations of the driver and cause amplified damage. Teach request_mem_region() to find and shoot down active /dev/mem mappings that it believes it has successfully claimed for the exclusive use of the driver. Effectively a driver call to request_mem_region() becomes a hole-punch on the /dev/mem device. The typical usage of unmap_mapping_range() is part of truncate_pagecache() to punch a hole in a file, but in this case the implementation is only doing the "first half" of a hole punch. Namely it is just evacuating current established mappings of the "hole", and it relies on the fact that /dev/mem establishes mappings in terms of absolute physical address offsets. Once existing mmap users are invalidated they can attempt to re-establish the mapping, or attempt to continue issuing read(2) / write(2) to the invalidated extent, but they will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can block those subsequent accesses. 
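Conceptually the hole punch amounts to something like the sketch below; the real revoke_devmem() added to drivers/char/mem.c by this patch tracks the open /dev/mem inode itself, so the devmem_inode pointer here is an assumption for illustration:

    /* Evacuate any established /dev/mem mapping overlapping the claimed
     * range. /dev/mem maps at absolute physical offsets, so the resource
     * start/size can be used directly as the hole to punch. This is only
     * the "first half" of a hole punch: existing PTEs are zapped, and any
     * retried access then runs into the CONFIG_IO_STRICT_DEVMEM checks. */
    unmap_mapping_range(devmem_inode->i_mapping, res->start,
                        resource_size(res), 1);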
Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") Signed-off-by: Dan Williams Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.2.3 From 342ed2400b78072cc01c0130ce41240dec60d56d Mon Sep 17 00:00:00 2001 From: Zhang Qiang Date: Wed, 27 May 2020 15:57:15 +0800 Subject: workqueue: Remove unnecessary kfree() call in rcu_free_wq() The data structure member "wq->rescuer" was reset to a null pointer in one if branch. It was passed to a call of the function "kfree" in the callback function "rcu_free_wq" (which was eventually executed). The function "kfree" does not perform more meaningful data processing for a passed null pointer (besides immediately returning from such a call). Thus delete this function call which became unnecessary with the referenced software update. Fixes: def98c84b6cd ("workqueue: Fix spurious sanity check failures in destroy_workqueue()") Suggested-by: Markus Elfring Signed-off-by: Zhang Qiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 10ed8d761e0b..7a1fc9fe6314 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3491,7 +3491,6 @@ static void rcu_free_wq(struct rcu_head *rcu) else free_workqueue_attrs(wq->unbound_attrs); - kfree(wq->rescuer); kfree(wq); } -- cgit v1.2.3 From ad1e4f74c072eaa2c6d77dd710db31aafecd614f Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Tue, 19 May 2020 20:14:10 +0200 Subject: PM: hibernate: Restrict writes to the resume device Hibernation via snapshot device requires write permission to the swap block device, the one that more often (but not necessarily) is used to store the hibernation image. With this patch, such permissions are granted iff: 1) snapshot device config option is enabled 2) swap partition is used as resume device In other circumstances the swap device is not writable from userspace. In order to achieve this, every write attempt to a swap device is checked against the device configured as part of the uswsusp API [0] using a pointer to the inode struct in memory. If the swap device being written was not configured for resuming, the write request is denied. NOTE: this implementation works only for swap block devices, where the inode configured by swapon (which sets S_SWAPFILE) is the same used by SNAPSHOT_SET_SWAP_AREA. In case of swap file, SNAPSHOT_SET_SWAP_AREA indeed receives the inode of the block device containing the filesystem where the swap file is located (+ offset in it) which is never passed to swapon and then has not set S_SWAPFILE. 
As result, the swap file itself (as a file) has never an option to be written from userspace. Instead it remains writable if accessed directly from the containing block device, which is always writeable from root. [0] Documentation/power/userland-swsusp.rst v2: - rename is_hibernate_snapshot_dev() to is_hibernate_resume_dev() - fix description so to correctly refer to the resume device Signed-off-by: Domenico Andreoli Acked-by: Darrick J. Wong Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 98548d1cf8a6..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,8 +35,14 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} + static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; @@ -95,6 +101,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -110,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -202,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -232,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; + bdput(bdev); return 0; } -- cgit v1.2.3 From 806f04e9fd2c6ad1e39bc2dba77155be0e4becde Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 19:12:36 +0200 Subject: rcu: Allow for smp_call_function() running callbacks from idle Current RCU hard relies on smp_call_function() callbacks running from interrupt context. A pending optimization is going to break that, it will allow idle CPUs to run the callbacks from the idle loop. This avoids raising the IPI on the requesting CPU and avoids handling an exception on the receiving CPU. Change rcu_is_cpu_rrupt_from_idle() to also accept task context, provided it is the idle task. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20200527171236.GC706495@hirez.programming.kicks-ass.net --- kernel/rcu/tree.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90c8be22d57a..f51385b86ea3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -418,16 +418,23 @@ void rcu_momentary_dyntick_idle(void) EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** - * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) - * interrupt from idle, return true. 
The caller must have at least - * disabled preemption. + * interrupt, or directly, from idle, return true. + * + * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { - /* Called only from within the scheduling-clock interrupt */ - lockdep_assert_in_irq(); + long nesting; + + /* + * Usually called from the tick; but also used from smp_function_call() + * for expedited grace periods. This latter can result in running from + * the idle task, instead of an actual IPI. + */ + lockdep_assert_irqs_disabled(); /* Check for counter underflows */ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, @@ -436,9 +443,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + if (nesting > 1) return false; + /* + * If we're not in an interrupt, we must be in the idle task! + */ + WARN_ON_ONCE(!nesting && !is_idle_task(current)); + /* Does CPU appear to be idle from an RCU standpoint? */ return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -- cgit v1.2.3 From 19a1f5ec699954d21be10f74ff71c2a7079e99ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:10:58 +0200 Subject: sched: Fix smp_call_function_single_async() usage for ILB The recent commit: 90b5363acd47 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in kick_ilb() got this wrong. While on first reading kick_ilb() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. Rework the nohz_idle_balance() trigger so that the release is in the IPI callback and thus guarantees the required serialization for the CSD. Fixes: 90b5363acd47 ("sched: Clean up scheduler_ipi()") Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Cc: mgorman@techsingularity.net Link: https://lore.kernel.org/r/20200526161907.778543557@infradead.org --- kernel/sched/core.c | 36 ++++++++++-------------------------- kernel/sched/fair.c | 18 ++++++++---------- kernel/sched/sched.h | 1 + 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95e457d4ed1c..2cacc1e44a84 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -637,41 +637,25 @@ void wake_up_nohz_cpu(int cpu) wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. 
*/ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -static void nohz_csd_func(void *info) -{ - struct rq *rq = info; - - if (got_nohz_idle_kick()) { - rq->idle_balance = 1; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; raise_softirq_irqoff(SCHED_SOFTIRQ); } } -#else /* CONFIG_NO_HZ_COMMON */ - -static inline bool got_nohz_idle_kick(void) -{ - return false; -} - #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dda9b194d225..2890bd54a088 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10024,6 +10024,10 @@ static void kick_ilb(unsigned int flags) if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; @@ -10371,20 +10375,14 @@ abort: */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; _nohz_idle_balance(this_rq, flags, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b32cff0dcbe..3c163cb5493f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -951,6 +951,7 @@ struct rq { struct callback_head *balance_callback; + unsigned char nohz_idle_balance; unsigned char idle_balance; unsigned long misfit_task_load; -- cgit v1.2.3 From 52103be07d8b08311955f8c30e535c2dda290cf4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:10:59 +0200 Subject: smp: Optimize flush_smp_call_function_queue() The call_single_queue can contain (two) different callbacks, synchronous and asynchronous. The current interrupt handler runs them in-order, which means that remote CPUs that are waiting for their synchronous call can be delayed by running asynchronous callbacks. Rework the interrupt handler to first run the synchonous callbacks. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.836818381@infradead.org --- kernel/smp.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..db2f73808db5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -209,9 +209,9 @@ void generic_smp_call_function_single_interrupt(void) */ static void flush_smp_call_function_queue(bool warn_cpu_offline) { - struct llist_head *head; - struct llist_node *entry; call_single_data_t *csd, *csd_next; + struct llist_node *entry, *prev; + struct llist_head *head; static bool warned; lockdep_assert_irqs_disabled(); @@ -235,20 +235,39 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func); } + /* + * First; run all SYNC callbacks, people are waiting for us. 
+ */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { smp_call_func_t func = csd->func; void *info = csd->info; /* Do we wait until *after* callback? */ if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } func(info); csd_unlock(csd); } else { - csd_unlock(csd); - func(info); + prev = &csd->llist; } } + /* + * Second; run all !SYNC callbacks. + */ + llist_for_each_entry_safe(csd, csd_next, entry, llist) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } + /* * Handle irq works queued remotely by irq_work_queue_on(). * Smp functions above are typically synchronous so they -- cgit v1.2.3 From afaa653c564da38c0b34c4baba31e88c46a8764c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:00 +0200 Subject: smp: Move irq_work_run() out of flush_smp_call_function_queue() This ensures flush_smp_call_function_queue() is strictly about call_single_queue. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.895109676@infradead.org --- kernel/smp.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index db2f73808db5..f720e38e880d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -84,6 +84,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * still pending. */ flush_smp_call_function_queue(false); + irq_work_run(); return 0; } @@ -191,6 +192,14 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, void generic_smp_call_function_single_interrupt(void) { flush_smp_call_function_queue(true); + + /* + * Handle irq works queued remotely by irq_work_queue_on(). + * Smp functions above are typically synchronous so they + * better run first since some other CPUs may be busy waiting + * for them. + */ + irq_work_run(); } /** @@ -267,14 +276,6 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd_unlock(csd); func(info); } - - /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. - */ - irq_work_run(); } /* -- cgit v1.2.3 From b2a02fc43a1f40ef4eb2fb2b06357382608d4d84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:01 +0200 Subject: smp: Optimize send_call_function_single_ipi() Just like the ttwu_queue_remote() IPI, make use of _TIF_POLLING_NRFLAG to avoid sending IPIs to idle CPUs. [ mingo: Fix UP build bug. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.953304789@infradead.org --- kernel/sched/core.c | 10 ++++++++++ kernel/sched/idle.c | 5 +++++ kernel/sched/sched.h | 7 ++++--- kernel/smp.c | 16 +++++++++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2cacc1e44a84..fa0d4990618e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2296,6 +2296,16 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } +void send_call_function_single_ipi(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + /* * Queue a task on the target CPUs wake_list and wake the CPU via IPI if * necessary. 
The wakee CPU on receipt of the IPI will queue the task diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b743bf38f08f..387fd75ee6f7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -289,6 +289,11 @@ static void do_idle(void) */ smp_mb__after_atomic(); + /* + * RCU relies on this call to be done outside of an RCU read-side + * critical section. + */ + flush_smp_call_function_from_idle(); sched_ttwu_pending(); schedule_idle(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c163cb5493f..75b062999c43 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1506,11 +1506,12 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -#else +extern void flush_smp_call_function_from_idle(void); +#else /* !CONFIG_SMP: */ +static inline void flush_smp_call_function_from_idle(void) { } static inline void sched_ttwu_pending(void) { } - -#endif /* CONFIG_SMP */ +#endif #include "stats.h" #include "autogroup.h" diff --git a/kernel/smp.c b/kernel/smp.c index f720e38e880d..9f1181375141 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -135,6 +135,8 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); +extern void send_call_function_single_ipi(int cpu); + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have @@ -178,7 +180,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, * equipped to do the right thing... */ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - arch_send_call_function_single_ipi(cpu); + send_call_function_single_ipi(cpu); return 0; } @@ -278,6 +280,18 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } +void flush_smp_call_function_from_idle(void) +{ + unsigned long flags; + + if (llist_empty(this_cpu_ptr(&call_single_queue))) + return; + + local_irq_save(flags); + flush_smp_call_function_queue(true); + local_irq_restore(flags); +} + /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. -- cgit v1.2.3 From 4b44a21dd640b692d4e9b12d3e37c24825f90baa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:02 +0200 Subject: irq_work, smp: Allow irq_work on call_single_queue Currently irq_work_queue_on() will issue an unconditional arch_send_call_function_single_ipi() and has the handler do irq_work_run(). This is unfortunate in that it makes the IPI handler look at a second cacheline and it misses the opportunity to avoid the IPI. Instead note that struct irq_work and struct __call_single_data are very similar in layout, so use a few bits in the flags word to encode a type and stick the irq_work on the call_single_queue list. 
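To make the layout trick concrete, here is a self-contained userspace sketch of the idea (illustrative only: apart from the CSD_TYPE_* spelling taken from this patch, every name and type below is invented and far simpler than the kernel's). Two entry types share a common { list node, flags } prefix, a type tag lives in spare bits of the flags word, and the consumer dispatches on that tag:

    /* toy_queue.c: one queue carrying two entry types with a shared prefix. */
    #include <stddef.h>
    #include <stdio.h>

    #define CSD_TYPE_SYNC     0x0
    #define CSD_TYPE_ASYNC    0x1
    #define CSD_TYPE_IRQ_WORK 0x2
    #define CSD_TYPE_MASK     0x3

    struct lnode { struct lnode *next; };

    struct csd {                     /* stands in for struct __call_single_data */
        struct lnode llist;
        unsigned int flags;          /* type tag kept in otherwise free bits */
        void (*func)(void *info);
        void *info;
    };

    struct irq_work {                /* stands in for struct irq_work */
        struct lnode llnode;
        unsigned int flags;
        void (*func)(struct irq_work *work);
    };

    /* The cast below is only legitimate because these offsets line up. */
    _Static_assert(offsetof(struct irq_work, llnode) == offsetof(struct csd, llist),
                   "list node must sit at the same offset");
    _Static_assert(offsetof(struct irq_work, flags) == offsetof(struct csd, flags),
                   "flags must sit at the same offset");

    static void flush_queue(struct lnode *head)
    {
        for (struct lnode *n = head; n; ) {
            struct lnode *next = n->next;
            struct csd *csd = (struct csd *)n;   /* common prefix, see asserts */

            switch (csd->flags & CSD_TYPE_MASK) {
            case CSD_TYPE_SYNC:
            case CSD_TYPE_ASYNC:
                csd->func(csd->info);
                break;
            case CSD_TYPE_IRQ_WORK: {
                struct irq_work *work = (struct irq_work *)n;
                work->func(work);
                break;
            }
            }
            n = next;
        }
    }

    static void say(void *info)            { puts(info); }
    static void irq_fn(struct irq_work *w) { (void)w; puts("irq_work entry"); }

    int main(void)
    {
        struct irq_work w = { .flags = CSD_TYPE_IRQ_WORK, .func = irq_fn };
        struct csd c = { .flags = CSD_TYPE_ASYNC, .func = say, .info = "csd entry" };

        c.llist.next = &w.llnode;            /* one list, two entry types */
        flush_queue(&c.llist);
        return 0;
    }

The real patch enforces the same property with BUILD_BUG_ON() on the field offsets instead of _Static_assert(), and encodes the type in bits of the already existing flags word so no extra storage is needed.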
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.011635912@infradead.org --- kernel/irq_work.c | 53 +++++++++++++----------- kernel/smp.c | 119 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 102 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 48b5d1b6af4d..eca83965b631 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &work->llnode); } else { __irq_work_queue_local(work); } @@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) return true; } +void irq_work_single(void *arg) +{ + struct irq_work *work = arg; + int flags; + + /* + * Clear the PENDING bit, after this point the @work + * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. + */ + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + + lockdep_irq_work_enter(work); + work->func(work); + lockdep_irq_work_exit(work); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); +} + static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; @@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) { - int flags; - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. - */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); - - lockdep_irq_work_enter(work); - work->func(work); - lockdep_irq_work_exit(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. 
- */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); - } + llist_for_each_entry_safe(work, tmp, llnode, llnode) + irq_work_single(work); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 9f1181375141..856562b80794 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,10 +23,8 @@ #include "smpboot.h" -enum { - CSD_FLAG_LOCK = 0x01, - CSD_FLAG_SYNCHRONOUS = 0x02, -}; + +#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -137,15 +135,33 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); extern void send_call_function_single_ipi(int cpu); +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (llist_add(node, &per_cpu(call_single_queue, cpu))) + send_call_function_single_ipi(cpu); +} + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd, - smp_call_func_t func, void *info) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { + smp_call_func_t func = csd->func; + void *info = csd->info; unsigned long flags; /* @@ -159,28 +175,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, return 0; } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } - csd->func = func; - csd->info = info; - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. - * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &csd->llist); return 0; } @@ -194,16 +194,10 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, void generic_smp_call_function_single_interrupt(void) { flush_smp_call_function_queue(true); - - /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. - */ - irq_work_run(); } +extern void irq_work_single(void *); + /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * @@ -241,9 +235,21 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. 
*/ - llist_for_each_entry(csd, entry, llist) - pr_warn("IPI callback %pS sent to offline CPU\n", - csd->func); + llist_for_each_entry(csd, entry, llist) { + switch (CSD_TYPE(csd)) { + case CSD_TYPE_ASYNC: + case CSD_TYPE_SYNC: + case CSD_TYPE_IRQ_WORK: + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + break; + + default: + pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", + CSD_TYPE(csd)); + break; + } + } } /* @@ -251,16 +257,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; - /* Do we wait until *after* callback? */ - if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + if (prev) { prev->next = &csd_next->llist; } else { entry = &csd_next->llist; } + func(info); csd_unlock(csd); } else { @@ -272,11 +279,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. */ llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; + int type = CSD_TYPE(csd); - csd_unlock(csd); - func(info); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } } } @@ -305,7 +318,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }; int this_cpu; int err; @@ -339,7 +352,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd_lock(csd); } - err = generic_exec_single(cpu, csd, func, info); + csd->func = func; + csd->info = info; + + err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); @@ -385,7 +401,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) csd->flags = CSD_FLAG_LOCK; smp_wmb(); - err = generic_exec_single(cpu, csd, csd->func, csd->info); + err = generic_exec_single(cpu, csd); out: preempt_enable(); @@ -500,7 +516,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_FLAG_SYNCHRONOUS; + csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) @@ -632,6 +648,17 @@ void __init smp_init(void) { int num_nodes, num_cpus; + /* + * Ensure struct irq_work layout matches so that + * flush_smp_call_function_queue() can do horrible things. + */ + BUILD_BUG_ON(offsetof(struct irq_work, llnode) != + offsetof(struct __call_single_data, llist)); + BUILD_BUG_ON(offsetof(struct irq_work, func) != + offsetof(struct __call_single_data, func)); + BUILD_BUG_ON(offsetof(struct irq_work, flags) != + offsetof(struct __call_single_data, flags)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From 126c2092e5c8b28623cb890cd2930aa292410676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:03 +0200 Subject: sched: Add rq::ttwu_pending In preparation of removing rq->wake_list, replace the !list_empty(rq->wake_list) with rq->ttwu_pending. This is not fully equivalent as this new variable is racy. 
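For illustration, a minimal userspace model of such a racy "pending" hint kept next to a lock-free list (all names are invented; this is not the scheduler code). The hint is set before the push and cleared when a whole batch is taken, so readers of the hint only ever see it briefly stale:

    /* hint.c: cheap pending hint beside a lock-free push list. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct item { struct item *next; int val; };

    static _Atomic(struct item *) queue;   /* stands in for rq->wake_list     */
    static atomic_uint pending;            /* stands in for rq->ttwu_pending  */

    static void enqueue(struct item *it)
    {
        /* Publish the hint first, then the item itself. */
        atomic_store_explicit(&pending, 1, memory_order_relaxed);
        struct item *old = atomic_load_explicit(&queue, memory_order_relaxed);
        do {
            it->next = old;
        } while (!atomic_compare_exchange_weak_explicit(&queue, &old, it,
                        memory_order_release, memory_order_relaxed));
    }

    static bool maybe_pending(void)
    {
        /* Cheap hint: may be momentarily wrong, never needs a lock. */
        return atomic_load_explicit(&pending, memory_order_relaxed) != 0;
    }

    static void drain(void)
    {
        struct item *list = atomic_exchange_explicit(&queue, NULL,
                                                     memory_order_acquire);
        if (!list)
            return;
        /* Clear the hint before walking the batch we just took. */
        atomic_store_explicit(&pending, 0, memory_order_relaxed);
        for (struct item *it = list; it; it = it->next)
            printf("woke %d\n", it->val);
    }

    int main(void)
    {
        struct item a = { .val = 1 }, b = { .val = 2 };

        enqueue(&a);
        enqueue(&b);
        printf("hint: %d\n", maybe_pending());
        drain();
        printf("hint after drain: %d\n", maybe_pending());
        return 0;
    }

Only the hint is racy; the list itself still provides the ordering, which is why the idle_cpu() checks in this patch can read rq->ttwu_pending without taking the rq lock.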
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.070399698@infradead.org --- kernel/sched/core.c | 13 +++++++++++-- kernel/sched/debug.c | 1 - kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 +++- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa0d4990618e..b71ed5ee97fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2275,13 +2275,21 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) void sched_ttwu_pending(void) { struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); + struct llist_node *llist; struct task_struct *p, *t; struct rq_flags rf; + llist = llist_del_all(&rq->wake_list); if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -2318,6 +2326,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + WRITE_ONCE(rq->ttwu_pending, 1); if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_call_function_single_async(cpu, &rq->wake_csd); @@ -4705,7 +4714,7 @@ int idle_cpu(int cpu) return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1c24a6bbdae2..36c54265bb2b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -638,7 +638,6 @@ do { \ P(nr_running); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2890bd54a088..0ed04d2a8959 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8590,7 +8590,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p) */ #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75b062999c43..c86fc94c54e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -895,7 +895,9 @@ struct rq { atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned long nr_load_updates; +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3 From a148866489fbe243c936fe43e4525d8dbfa0318f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:04 +0200 Subject: sched: Replace rq::wake_list The recent commit: 90b5363acd47 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in ttwu_queue_remote() got this wrong. While on first reading ttwu_queue_remote() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. The actual race is vs the sched_ttwu_pending() call in the idle loop; that can run the wakeup-list without consuming the CSD. Instead of trying to chain the lists, merge them. 
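A toy model of the ownership rule being restored here (simplified, with invented names; not the kernel's csd code): whoever wins the flag owns the descriptor, and the flag is released only at the point where the descriptor really may be reused, which is on the consumer side rather than wherever the producer finds convenient:

    /* csd_owner.c: why the 'release' must sit where reuse becomes safe. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct csd {
        atomic_bool busy;            /* serializes reuse of this descriptor */
        void (*func)(void *info);
        void *info;
    };

    /* Producer side: only the winner of the flag may (re)fill the csd. */
    static int queue_async(struct csd *csd, void (*func)(void *), void *info)
    {
        if (atomic_exchange_explicit(&csd->busy, true, memory_order_acquire))
            return -1;               /* still in flight: caller must back off */
        csd->func = func;
        csd->info = info;
        return 0;
    }

    /* Consumer side (the "IPI handler"): copy out, release, then run.
     * Releasing any earlier would let a second producer overwrite
     * func/info while this invocation is still using them. */
    static void run_one(struct csd *csd)
    {
        void (*func)(void *) = csd->func;
        void *info = csd->info;

        atomic_store_explicit(&csd->busy, false, memory_order_release);
        func(info);
    }

    static void hello(void *info) { puts(info); }

    int main(void)
    {
        static struct csd csd;       /* static zero-init: not busy */

        if (queue_async(&csd, hello, "first use") == 0)
            run_one(&csd);
        if (queue_async(&csd, hello, "safe reuse after release") == 0)
            run_one(&csd);
        return 0;
    }

In the kernel the same rule is what makes smp_call_function_single_async() callers responsible for serializing csd reuse: the -EBUSY return is only a best-effort hint, not the serialization itself.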
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.129371594@infradead.org --- kernel/sched/core.c | 25 +++++++------------------ kernel/sched/idle.c | 1 - kernel/sched/sched.h | 8 -------- kernel/smp.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 47 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b71ed5ee97fd..b3c64c6b4d2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1538,7 +1538,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -2272,14 +2272,13 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist; struct task_struct *p, *t; struct rq_flags rf; - llist = llist_del_all(&rq->wake_list); if (!llist) return; @@ -2299,11 +2298,6 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -static void wake_csd_func(void *info) -{ - sched_ttwu_pending(); -} - void send_call_function_single_ipi(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -2327,12 +2321,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - if (llist_add(&p->wake_entry, &rq->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_call_function_single_async(cpu, &rq->wake_csd); - else - trace_sched_wake_idle_without_ipi(cpu); - } + __smp_call_single_queue(cpu, &p->wake_entry); } void wake_up_if_idle(int cpu) @@ -2772,6 +2761,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry_type = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -6564,7 +6556,6 @@ int sched_cpu_dying(unsigned int cpu) struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6763,8 +6754,6 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; - rq_csd_init(rq, &rq->wake_csd, wake_csd_func); - INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 387fd75ee6f7..05deb81bb3e3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -294,7 +294,6 @@ static void do_idle(void) * critical section. 
*/ flush_smp_call_function_from_idle(); - sched_ttwu_pending(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c86fc94c54e5..1d4e94c1e5fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,11 +1023,6 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - call_single_data_t wake_csd; - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -1371,8 +1366,6 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1512,7 +1505,6 @@ extern void flush_smp_call_function_from_idle(void); #else /* !CONFIG_SMP: */ static inline void flush_smp_call_function_from_idle(void) { } -static inline void sched_ttwu_pending(void) { } #endif #include "stats.h" diff --git a/kernel/smp.c b/kernel/smp.c index 856562b80794..0d61dc060b01 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -196,6 +196,7 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } +extern void sched_ttwu_pending(void *); extern void irq_work_single(void *); /** @@ -244,6 +245,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func); break; + case CSD_TYPE_TTWU: + pr_warn("IPI task-wakeup sent to offline CPU\n"); + break; + default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); @@ -275,22 +280,43 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } + if (!entry) + return; + /* * Second; run all !SYNC callbacks. */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { int type = CSD_TYPE(csd); - if (type == CSD_TYPE_ASYNC) { - smp_call_func_t func = csd->func; - void *info = csd->info; + if (type != CSD_TYPE_TTWU) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } - csd_unlock(csd); - func(info); - } else if (type == CSD_TYPE_IRQ_WORK) { - irq_work_single(csd); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } + + } else { + prev = &csd->llist; } } + + /* + * Third; only CSD_TYPE_TTWU is left, issue those. + */ + if (entry) + sched_ttwu_pending(entry); } void flush_smp_call_function_from_idle(void) @@ -659,6 +685,13 @@ void __init smp_init(void) BUILD_BUG_ON(offsetof(struct irq_work, flags) != offsetof(struct __call_single_data, flags)); + /* + * Assert the CSD_TYPE_TTWU layout is similar enough + * for task_struct to be on the @call_single_queue. + */ + BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != + offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From 1f8db4150536431b031585ecc2a6793f69245de2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2020 11:01:34 +0200 Subject: sched/headers: Split out open-coded prototypes into kernel/sched/smp.h Move the prototypes for sched_ttwu_pending() and send_call_function_single_ipi() into the newly created kernel/sched/smp.h header, to make sure they are all the same, and to architectures happy that use -Wmissing-prototypes. 
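As a generic illustration of the convention (toy files, not the actual scheduler sources): one shared header carries the extern declarations, and both the defining and the calling translation units include it, which is what keeps the prototypes identical and satisfies -Wmissing-prototypes:

    /* shared.h: the single place both sides agree on */
    #ifndef SHARED_H
    #define SHARED_H
    void do_pending(void *arg);
    #endif

    /* definer.c */
    #include <stdio.h>
    #include "shared.h"
    void do_pending(void *arg)       /* seen a prototype, so no warning */
    {
        printf("handling %p\n", arg);
    }

    /* caller.c */
    #include "shared.h"
    int main(void)
    {
        do_pending((void *)0);
        return 0;
    }

Compiling definer.c with -Wmissing-prototypes stays quiet only because it sees the same declaration the callers do; an open-coded extern in caller.c would not help the definer and could silently drift out of sync.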
Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/smp.h | 9 +++++++++ kernel/smp.c | 5 +---- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/smp.h (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3c64c6b4d2e..43ba2d4a8eca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -20,6 +20,7 @@ #include "../smpboot.h" #include "pelt.h" +#include "smp.h" #define CREATE_TRACE_POINTS #include diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h new file mode 100644 index 000000000000..9620e323162c --- /dev/null +++ b/kernel/sched/smp.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal SMP callback types and methods between the scheduler + * and other internal parts of the core kernel: + */ + +extern void sched_ttwu_pending(void *arg); + +extern void send_call_function_single_ipi(int cpu); diff --git a/kernel/smp.c b/kernel/smp.c index 0d61dc060b01..4dec04f7fdc5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -22,7 +22,7 @@ #include #include "smpboot.h" - +#include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) @@ -133,8 +133,6 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); -extern void send_call_function_single_ipi(int cpu); - void __smp_call_single_queue(int cpu, struct llist_node *node) { /* @@ -196,7 +194,6 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } -extern void sched_ttwu_pending(void *); extern void irq_work_single(void *); /** -- cgit v1.2.3 From 1d0326f352bb094771df17f045bdbadff89a43e6 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 14 May 2020 02:25:55 +0200 Subject: genirq: Check irq_data_get_irq_chip() return value before use irq_data_get_irq_chip() can return NULL, however it is expected that this never happens. If a buggy driver leads to NULL being returned from irq_data_get_irq_chip(), warn about it instead of crashing the machine. Signed-off-by: Marek Vasut Signed-off-by: Thomas Gleixner To: linux-arm-kernel@lists.infradead.org --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453a8a0f4804..761911168438 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2619,6 +2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -- cgit v1.2.3 From 936f2a70f2077f64fab1dcb3eca71879e82ecd3f Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 27 May 2020 14:43:19 -0700 Subject: cgroup: add cpu.stat file to root cgroup Currently, the root cgroup does not have a cpu.stat file. Add one which is consistent with /proc/stat to capture global cpu statistics that might not fall under cgroup accounting. We haven't done this in the past because the data are already presented in /proc/stat and we didn't want to add overhead from collecting root cgroup stats when cgroups are configured, but no cgroups have been created. 
By keeping the data consistent with /proc/stat, I think we avoid the first problem, while improving the usability of cgroups stats. We avoid the second problem by computing the contents of cpu.stat from existing data collected for /proc/stat anyway. Signed-off-by: Boris Burkov Suggested-by: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - kernel/cgroup/rstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a016749de21..51924ebdff51 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4874,7 +4874,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 41ca996568df..b6397a186ce9 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -389,18 +389,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc); } +/* + * compute the cputime for the root cgroup by getting the per cpu data + * at a global level, then categorizing the fields in a manner consistent + * with how it is done by __cgroup_account_cputime_field for each bit of + * cpu time attributed to a cgroup. + */ +static void root_cgroup_cputime(struct task_cputime *cputime) +{ + int i; + + cputime->stime = 0; + cputime->utime = 0; + cputime->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; + } +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - cgroup_rstat_flush_release(); + struct task_cputime cputime; + + if (cgroup_parent(cgrp)) { + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, + &utime, &stime); + cgroup_rstat_flush_release(); + } else { + root_cgroup_cputime(&cputime); + usage = cputime.sum_exec_runtime; + utime = cputime.utime; + stime = cputime.stime; + } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); -- cgit v1.2.3 From d8bb65ab70f702531aaaa11d9710f9450078e295 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:32 +0200 Subject: workqueue: Use rcuwait for wq_manager_wait The workqueue code has it's internal spinlock (pool::lock) and also implicit spinlock usage in the wq_manager waitqueue. These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. 
Taking sleeping locks from such contexts is forbidden. pool::lock can be converted to a raw spinlock as the lock held times are short. But the workqueue manager waitqueue is handled inside of pool::lock held regions which again violates the lock nesting rules of raw and regular spinlocks. The manager waitqueue has no special requirements like custom wakeup callbacks or mass wakeups. While it does not use exclusive wait mode explicitly there is no strict requirement to queue the waiters in a particular order as there is only one waiter at a time. This allows to replace the waitqueue with rcuwait which solves the locking problem because rcuwait relies on existing locking. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7a1fc9fe6314..7c3566f8e4ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,8 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ +/* wait for manager to go away */ +static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker) pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; - wake_up(&wq_manager_wait); + rcuwait_wake_up(&manager_wait); return true; } @@ -3503,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } +/* This returns with the lock held on success (pool manager is inactive). */ +static bool wq_manager_inactive(struct worker_pool *pool) +{ + spin_lock_irq(&pool->lock); + + if (pool->flags & POOL_MANAGER_ACTIVE) { + spin_unlock_irq(&pool->lock); + return false; + } + return true; +} + /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3538,10 +3551,11 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. + * Because of how wq_manager_inactive() works, we will hold the + * spinlock after a successful wait. */ - spin_lock_irq(&pool->lock); - wait_event_lock_irq(wq_manager_wait, - !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), + TASK_UNINTERRUPTIBLE); pool->flags |= POOL_MANAGER_ACTIVE; while ((worker = first_idle_worker(pool))) -- cgit v1.2.3 From a9b8a985294debae00f6c087dfec8c384d30a3b9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:33 +0200 Subject: workqueue: Convert the pool::lock and wq_mayday_lock to raw_spinlock_t The workqueue code has it's internal spinlocks (pool::lock), which are acquired on most workqueue operations. These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. Taking sleeping locks from such contexts is forbidden. 
The pool::lock hold times are bound and the code sections are relatively short, which allows to convert pool::lock and as a consequence wq_mayday_lock to raw spinlocks which are truly spinning locks even on a PREEMPT_RT kernel. With the previous conversion of the manager waitqueue to a simple waitqueue workqueues are now fully RT compliant. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 176 ++++++++++++++++++++++++++--------------------------- 1 file changed, 88 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7c3566f8e4ca..82f85f5d81a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - spinlock_t lock; /* the pool lock */ + raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ @@ -300,7 +300,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ -static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); @@ -827,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -882,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -901,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task) if (next) wake_up_process(next->task); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -912,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task) * the scheduler to get a worker's last known identity. * * CONTEXT: - * spin_lock_irq(rq->lock) + * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during @@ -943,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task) * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { @@ -968,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -1016,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
* * Return: * Pointer to worker which is executing @work if found, %NULL @@ -1051,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1129,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } } @@ -1164,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { @@ -1263,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!pool) goto fail; - spin_lock(&pool->lock); + raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point @@ -1292,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); @@ -1317,7 +1317,7 @@ fail: * work_struct flags. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) @@ -1434,7 +1434,7 @@ retry: if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_pool->lock); + raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); @@ -1442,11 +1442,11 @@ retry: pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); + raw_spin_unlock(&last_pool->lock); + raw_spin_lock(&pwq->pool->lock); } } else { - spin_lock(&pwq->pool->lock); + raw_spin_lock(&pwq->pool->lock); } /* @@ -1459,7 +1459,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } @@ -1491,7 +1491,7 @@ retry: insert_work(pwq, work, worklist, work_flags); out: - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); rcu_read_unlock(); } @@ -1760,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work); * necessary. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1800,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void worker_leave_idle(struct worker *worker) { @@ -1938,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker_attach_to_pool(worker, pool); /* start the newly created worker */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -1961,7 +1961,7 @@ fail: * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { @@ -1987,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; @@ -2005,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t) destroy_worker(worker); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) @@ -2036,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t) struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; - spin_lock_irq(&pool->lock); - spin_lock(&wq_mayday_lock); /* for wq->maydays */ + raw_spin_lock_irq(&pool->lock); + raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -2050,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t) send_mayday(work); } - spin_unlock(&wq_mayday_lock); - spin_unlock_irq(&pool->lock); + raw_spin_unlock(&wq_mayday_lock); + raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2070,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t) * may_start_working() %true. * * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ @@ -2079,7 +2079,7 @@ __releases(&pool->lock) __acquires(&pool->lock) { restart: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2095,7 +2095,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have @@ -2118,7 +2118,7 @@ restart: * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: @@ -2157,7 +2157,7 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 
*/ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) @@ -2239,7 +2239,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2294,7 +2294,7 @@ __acquires(&pool->lock) */ cond_resched(); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2320,7 +2320,7 @@ __acquires(&pool->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2362,11 +2362,11 @@ static int worker_thread(void *__worker) /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); @@ -2432,7 +2432,7 @@ sleep: */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2486,7 +2486,7 @@ repeat: should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2498,11 +2498,11 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and @@ -2531,7 +2531,7 @@ repeat: * incur MAYDAY_INTERVAL delay inbetween. */ if (need_to_create_worker(pool)) { - spin_lock(&wq_mayday_lock); + raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. @@ -2540,7 +2540,7 @@ repeat: get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } - spin_unlock(&wq_mayday_lock); + raw_spin_unlock(&wq_mayday_lock); } } @@ -2558,14 +2558,14 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); } - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); @@ -2645,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, @@ -2732,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2749,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) @@ -2949,9 +2949,9 @@ reflush: for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2987,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, return false; } - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3003,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a @@ -3022,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, rcu_read_unlock(); return true; already_gone: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } @@ -3415,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, */ static int init_worker_pool(struct worker_pool *pool) { - spin_lock_init(&pool->lock); + raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; @@ -3507,10 +3507,10 @@ static void rcu_free_pool(struct rcu_head *rcu) /* This returns with the lock held on success (pool manager is inactive). 
*/ static bool wq_manager_inactive(struct worker_pool *pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (pool->flags & POOL_MANAGER_ACTIVE) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return false; } return true; @@ -3561,7 +3561,7 @@ static void put_unbound_pool(struct worker_pool *pool) while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) @@ -3717,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) return; /* this function can be called during early boot w/ irq disabled */ - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3740,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4142,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - spin_lock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); - spin_unlock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); @@ -4373,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq) struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); @@ -4389,18 +4389,18 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); @@ -4571,10 +4571,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); @@ -4781,10 +4781,10 @@ void show_workqueue_state(void) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. 
* sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4798,7 +4798,7 @@ void show_workqueue_state(void) struct worker *worker; bool first = true; - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -4817,7 +4817,7 @@ void show_workqueue_state(void) } pr_cont("\n"); next_pool: - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4847,7 +4847,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) struct worker_pool *pool = worker->pool; if (pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If @@ -4861,7 +4861,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) scnprintf(buf + off, size - off, "-%s", worker->desc); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4892,7 +4892,7 @@ static void unbind_workers(int cpu) for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers @@ -4906,7 +4906,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); /* @@ -4932,9 +4932,9 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4961,7 +4961,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; @@ -5000,7 +5000,7 @@ static void rebind_workers(struct worker_pool *pool) WRITE_ONCE(worker->flags, worker_flags); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** -- cgit v1.2.3 From 4f3f4cf388f8fda7ee8ea7c6af0ff0ebb2d05fe4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:58:59 +0000 Subject: workqueue: void unneeded requeuing the pwq in rescuer thread 008847f66c3 ("workqueue: allow rescuer thread to do more work.") made the rescuer worker requeue the pwq immediately if there may be more work items which need rescuing instead of waiting for the next mayday timer expiration. Unfortunately, it checks only whether the pool needs help from rescuers, but it doesn't check whether the pwq has work items in the pool (the real reason that this rescuer can help for the pool). The patch adds the check and void unneeded requeuing. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82f85f5d81a8..6feefc65d332 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2530,7 +2530,7 @@ repeat: * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. 
*/ - if (need_to_create_worker(pool)) { + if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction -- cgit v1.2.3 From b8f06b0444ec146e3ae98caac8039c77e5308ce2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:59:02 +0000 Subject: workqueue: remove useless unlock() and lock() in series This is no point to unlock() and then lock() the same mutex back to back. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6feefc65d332..c667ca5aed61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4403,13 +4403,11 @@ void destroy_workqueue(struct workqueue_struct *wq) raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_pool_mutex); list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); -- cgit v1.2.3 From 6d3cf962dd1a95df868c547b090bfc4c7977f4be Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 May 2020 11:05:43 -0700 Subject: printk: Collapse shutdown types into a single dump reason To turn the KMSG_DUMP_* reasons into a more ordered list, collapse the redundant KMSG_DUMP_(RESTART|HALT|POWEROFF) reasons into KMSG_DUMP_SHUTDOWN. The current users already don't meaningfully distinguish between them, so there's no need to, as discussed here: https://lore.kernel.org/lkml/CA+CK2bAPv5u1ih5y9t5FUnTyximtFCtDYXJCpuyjOyHNOkRdqw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/20200515184434.8470-2-keescook@chromium.org/ Reviewed-by: Pavel Tatashin Reviewed-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Signed-off-by: Kees Cook --- kernel/reboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); -- cgit v1.2.3 From b1f6f161b236d0e5a9222fb8b482e65aaff13689 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:06 -0400 Subject: printk: honor the max_reason field in kmsg_dumper kmsg_dump() allows to dump kmesg buffer for various system events: oops, panic, reboot, etc. It provides an interface to register a callback call for clients, and in that callback interface there is a field "max_reason", but it was getting ignored when set to any "reason" higher than KMSG_DUMP_OOPS unless "always_kmsg_dump" was passed as kernel parameter. Allow clients to actually control their "max_reason", and keep the current behavior when "max_reason" is not set. 
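As a minimal sketch of what this enables (not part of this patch; the dumper name and callback body are made up, while struct kmsg_dumper, kmsg_dump_register() and the KMSG_DUMP_* reasons are the existing interface, and the ordering assumed here is the reworked list where KMSG_DUMP_SHUTDOWN ranks above KMSG_DUMP_OOPS), a client that also wants shutdown-class dumps would now set max_reason explicitly:

	#include <linux/kmsg_dump.h>
	#include <linux/module.h>

	static void example_dump(struct kmsg_dumper *dumper,
				 enum kmsg_dump_reason reason)
	{
		/* pull records with kmsg_dump_get_buffer()/kmsg_dump_get_line() */
	}

	static struct kmsg_dumper example_dumper = {
		.dump		= example_dump,
		/* opt in to reasons above OOPS, e.g. the new KMSG_DUMP_SHUTDOWN */
		.max_reason	= KMSG_DUMP_SHUTDOWN,
	};

	static int __init example_dumper_init(void)
	{
		return kmsg_dump_register(&example_dumper);
	}
	module_init(example_dumper_init);
	MODULE_LICENSE("GPL");

A dumper that leaves max_reason at KMSG_DUMP_UNDEF keeps the old behavior: it is only called for reasons up to KMSG_DUMP_OOPS, unless always_kmsg_dump is set, in which case every reason is delivered.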
Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-3-keescook@chromium.org/ Reviewed-by: Petr Mladek Signed-off-by: Kees Cook --- kernel/printk/printk.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..a121c2255737 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3157,12 +3157,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ -- cgit v1.2.3 From fb13cb8a0482105a415e24042209d02a684255e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 May 2020 19:36:22 -0700 Subject: printk: Introduce kmsg_dump_reason_str() The pstore subsystem already had a private version of this function. With the coming addition of the pstore/zone driver, this needs to be shared. As it really should live with printk, move it there instead. Link: https://lore.kernel.org/lkml/20200515184434.8470-4-keescook@chromium.org/ Acked-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: Pavel Tatashin Signed-off-by: Kees Cook --- kernel/printk/printk.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a121c2255737..14ca4d05d902 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3144,6 +3144,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping -- cgit v1.2.3 From 1b94b3aed367ff2cdc84d325e0aa9d7cc9e3cf2a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:19 -0500 Subject: tracing: Check state.disabled in synth event trace functions Since trace_state.disabled is set in __synth_event_trace_start() at the same time -ENOENT is returned, don't bother returning -ENOENT - just have callers check trace_state.disabled instead, and avoid the extra return val munging. 
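For reference, the caller pattern this produces (lifted from the synth_event_trace() and synth_event_trace_array() hunks below) is:

	ret = __synth_event_trace_start(file, &state);
	if (ret || state.disabled)	/* soft-disabled is not an error */
		return ret;

instead of the old dance of mapping -ENOENT back to 0 before returning.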
Link: http://lkml.kernel.org/r/87315c3889af870e8370e82b76cf48b426d70130.1585941485.git.zanussi@kernel.org Suggested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcab11cc6833..5cfe8f998b3e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1845,7 +1845,6 @@ __synth_event_trace_start(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED) || trace_trigger_soft_disabled(file)) { trace_state->disabled = true; - ret = -ENOENT; goto out; } @@ -1907,11 +1906,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -1987,11 +1983,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -2067,16 +2060,10 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - int ret; - if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - - return ret; + return __synth_event_trace_start(file, trace_state); } EXPORT_SYMBOL_GPL(synth_event_trace_start); -- cgit v1.2.3 From 2d19bd79ae6509858582a9cade739c2e9a4fdca8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:21 -0500 Subject: tracing: Add hist_debug trace event files for histogram debugging Add a new "hist_debug" file for each trace event, which when read will dump out a bunch of internal details about the hist triggers defined on that event. This is normally off but can be enabled by saying 'y' to the new CONFIG_HIST_TRIGGERS_DEBUG config option. This is in support of the new Documentation file describing histogram internals, Documentation/trace/histogram-design.rst, which was requested by developers trying to understand the internals when extending or making use of the hist triggers for higher-level tools. The histogram-design.rst documentation refers to the hist_debug files and demonstrates their use with output in the test examples. Link: http://lkml.kernel.org/r/77914c22b0ba493d9783c53bbfbc6087d6a7e1b1.1585941485.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 23 ++++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 273 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 301 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..2aec9376825f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -847,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST If unsure, say N. 
+config HIST_TRIGGERS_DEBUG + bool "Hist trigger debug support" + depends on HIST_TRIGGERS + help + Add "hist_debug" file for each event, which when read will + dump out a bunch of internal details about the hist triggers + defined on that event. + + The hist_debug file serves a couple of purposes: + + - Helps developers verify that nothing is broken. + + - Provides educational information to support the details + of the hist trigger internals as described by + Documentation/trace/histogram-design.rst. + + The hist_debug output only covers the data structures + related to the histogram definitions themselves and doesn't + display the internals of map buckets or variable values of + running histograms. + + If unsure, say N. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb1d004d5f2..def769df5bf1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1661,6 +1661,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 242f59e7f17d..f6f55682d3e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2208,6 +2208,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, &event_hist_fops); +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + trace_create_file("hist_debug", 0444, file->dir, file, + &event_hist_debug_fops); #endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5cfe8f998b3e..313227da4925 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6478,6 +6478,279 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +#ifdef CONFIG_HIST_TRIGGERS_DEBUG +static void hist_field_debug_show_flags(struct seq_file *m, + unsigned long flags) +{ + seq_puts(m, " flags:\n"); + + if (flags & HIST_FIELD_FL_KEY) + seq_puts(m, " HIST_FIELD_FL_KEY\n"); + else if (flags & HIST_FIELD_FL_HITCOUNT) + seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n"); + else if (flags & HIST_FIELD_FL_VAR) + seq_puts(m, " HIST_FIELD_FL_VAR\n"); + else if (flags & HIST_FIELD_FL_VAR_REF) + seq_puts(m, " HIST_FIELD_FL_VAR_REF\n"); + else + seq_puts(m, " VAL: normal u64 value\n"); + + if (flags & HIST_FIELD_FL_ALIAS) + seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); +} + +static int hist_field_debug_show(struct seq_file *m, + struct hist_field *field, unsigned long flags) +{ + if ((field->flags & flags) != flags) { + seq_printf(m, "ERROR: bad flags - %lx\n", flags); + return -EINVAL; + } + + hist_field_debug_show_flags(m, field->flags); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + + if (field->flags & HIST_FIELD_FL_VAR) { + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + } + + if (field->flags & HIST_FIELD_FL_ALIAS) + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + + if (field->flags & HIST_FIELD_FL_VAR_REF) { + seq_printf(m, " name: %s\n", field->name); + seq_printf(m, " var.idx (into 
tracing_map_elt.vars[]): %u\n", + field->var.idx); + seq_printf(m, " var.hist_data: %p\n", field->var.hist_data); + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + if (field->system) + seq_printf(m, " system: %s\n", field->system); + if (field->event_name) + seq_printf(m, " event_name: %s\n", field->event_name); + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); + + return 0; +} + +static int field_var_debug_show(struct seq_file *m, + struct field_var *field_var, unsigned int i, + bool save_vars) +{ + const char *vars_name = save_vars ? "save_vars" : "field_vars"; + struct hist_field *field; + int ret = 0; + + seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i); + + field = field_var->var; + + seq_printf(m, "\n %s[%d].var:\n", vars_name, i); + + hist_field_debug_show_flags(m, field->flags); + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + + field = field_var->val; + + seq_printf(m, "\n %s[%d].val:\n", vars_name, i); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + else { + ret = -EINVAL; + goto out; + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); +out: + return ret; +} + +static int hist_action_debug_show(struct seq_file *m, + struct action_data *data, int i) +{ + int ret = 0; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i); + ret = hist_field_debug_show(m, data->track_data.var_ref, + HIST_FIELD_FL_VAR_REF); + if (ret) + goto out; + + seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i); + ret = hist_field_debug_show(m, data->track_data.track_var, + HIST_FIELD_FL_VAR); + if (ret) + goto out; + } + + if (data->handler == HANDLER_ONMATCH) { + seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n", + i, data->match_data.event_system); + seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n", + i, data->match_data.event); + } +out: + return ret; +} + +static int hist_actions_debug_show(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + int i, ret = 0; + + if (hist_data->n_actions) + seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n"); + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *action = hist_data->actions[i]; + + ret = hist_action_debug_show(m, action, i); + if (ret) + goto out; + } + + if (hist_data->n_save_vars) + seq_puts(m, "\n save action variables (save() params):\n"); + + for (i = 0; i < hist_data->n_save_vars; i++) { + ret = field_var_debug_show(m, hist_data->save_vars[i], i, true); + if (ret) + goto out; + } +out: + return ret; +} + +static void hist_trigger_debug_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int i, ret; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + + seq_printf(m, "hist_data: %p\n\n", hist_data); + seq_printf(m, " n_vals: %u\n", hist_data->n_vals); + seq_printf(m, " n_keys: %u\n", hist_data->n_keys); + seq_printf(m, " n_fields: %u\n", hist_data->n_fields); + + 
seq_puts(m, "\n val fields:\n\n"); + + seq_puts(m, " hist_data->fields[0]:\n"); + ret = hist_field_debug_show(m, hist_data->fields[0], + HIST_FIELD_FL_HITCOUNT); + if (ret) + return; + + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], 0); + if (ret) + return; + } + + seq_puts(m, "\n key fields:\n"); + + for (i = hist_data->n_vals; i < hist_data->n_fields; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], + HIST_FIELD_FL_KEY); + if (ret) + return; + } + + if (hist_data->n_var_refs) + seq_puts(m, "\n variable reference fields:\n"); + + for (i = 0; i < hist_data->n_var_refs; i++) { + seq_printf(m, "\n hist_data->var_refs[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->var_refs[i], + HIST_FIELD_FL_VAR_REF); + if (ret) + return; + } + + if (hist_data->n_field_vars) + seq_puts(m, "\n field variables:\n"); + + for (i = 0; i < hist_data->n_field_vars; i++) { + ret = field_var_debug_show(m, hist_data->field_vars[i], i, false); + if (ret) + return; + } + + ret = hist_actions_debug_show(m, hist_data); + if (ret) + return; +} + +static int hist_debug_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_debug_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_debug_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return single_open(file, hist_debug_show, file); +} + +const struct file_operations event_hist_debug_fops = { + .open = event_hist_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); -- cgit v1.2.3 From 726721a51838e3983023f906580722fc83f804ee Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 28 May 2020 14:32:37 -0500 Subject: tracing: Move synthetic events to a separate file With the addition of the in-kernel synthetic event API, synthetic events are no longer specifically tied to the histogram triggers. The synthetic event code is also making trace_event_hist.c very bloated, so for those reasons, move it to a separate file, trace_events_synth.c, along with a new trace_synth.h header file. Because synthetic events are now independent from hist triggers, add a new CONFIG_SYNTH_EVENTS config option, and have CONFIG_HIST_TRIGGERS select it, and have CONFIG_SYNTH_EVENT_GEN_TEST depend on it. 
Link: http://lkml.kernel.org/r/4d1fa1f85ed5982706ac44844ac92451dcb04715.1590693308.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 +- kernel/trace/Makefile | 1 + kernel/trace/trace_events_hist.c | 2071 +++---------------------------------- kernel/trace/trace_events_synth.c | 1789 ++++++++++++++++++++++++++++++++ kernel/trace/trace_synth.h | 36 + 5 files changed, 1989 insertions(+), 1928 deletions(-) create mode 100644 kernel/trace/trace_events_synth.c create mode 100644 kernel/trace/trace_synth.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2aec9376825f..75407d5dc83a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -623,12 +623,30 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config SYNTH_EVENTS + bool "Synthetic trace events" + select TRACING + select DYNAMIC_EVENTS + default n + help + Synthetic events are user-defined trace events that can be + used to combine data from other trace events or in fact any + data source. Synthetic events can be generated indirectly + via the trace() action of histogram triggers or directly + by way of an in-kernel API. + + See Documentation/trace/events.rst or + Documentation/trace/histogram.rst for details and examples. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING select DYNAMIC_EVENTS + select SYNTH_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields @@ -824,7 +842,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on HIST_TRIGGERS + depends on SYNTH_EVENTS help This option creates a test module to check the base functionality of in-kernel synthetic event definition and diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..1d8aaa546412 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -72,6 +72,7 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o +obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 313227da4925..0b933546142e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,13 +19,7 @@ #include #include "tracing_map.h" -#include "trace.h" -#include "trace_dynevent.h" - -#define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 - -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#include "trace_synth.h" #define ERRORS \ C(NONE, "No error"), \ @@ -380,69 +374,6 @@ struct hist_trigger_data { unsigned int n_save_var_str; }; -static int create_synth_event(int argc, const char **argv); -static int synth_event_show(struct seq_file *m, struct dyn_event *ev); -static int synth_event_release(struct dyn_event *ev); -static bool synth_event_is_busy(struct dyn_event *ev); -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev); - -static struct dyn_event_operations synth_event_ops = { - .create = create_synth_event, - .show = synth_event_show, - .is_busy = 
synth_event_is_busy, - .free = synth_event_release, - .match = synth_event_match, -}; - -struct synth_field { - char *type; - char *name; - size_t size; - unsigned int offset; - bool is_signed; - bool is_string; -}; - -struct synth_event { - struct dyn_event devent; - int ref; - char *name; - struct synth_field **fields; - unsigned int n_fields; - unsigned int n_u64; - struct trace_event_class class; - struct trace_event_call call; - struct tracepoint *tp; - struct module *mod; -}; - -static bool is_synth_event(struct dyn_event *ev) -{ - return ev->ops == &synth_event_ops; -} - -static struct synth_event *to_synth_event(struct dyn_event *ev) -{ - return container_of(ev, struct synth_event, devent); -} - -static bool synth_event_is_busy(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - return event->ref != 0; -} - -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev) -{ - struct synth_event *sev = to_synth_event(ev); - - return strcmp(sev->name, event) == 0 && - (!system || strcmp(system, SYNTH_SYSTEM) == 0); -} - struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -490,1882 +421,202 @@ struct action_data { union { struct { char *event; - char *event_system; - } match_data; - - struct { - /* - * var_str contains the $-unstripped variable - * name referenced by var_ref, and used when - * printing the action. Because var_ref - * creation is deferred to create_actions(), - * we need a per-action way to save it until - * then, thus var_str. - */ - char *var_str; - - /* - * var_ref refers to the variable being - * tracked e.g onmax($var). - */ - struct hist_field *var_ref; - - /* - * track_var contains the 'invisible' tracking - * variable created to keep the current - * e.g. max value. 
- */ - struct hist_field *track_var; - - check_track_val_fn_t check_val; - action_fn_t save_data; - } track_data; - }; -}; - -struct track_data { - u64 track_val; - bool updated; - - unsigned int key_len; - void *key; - struct tracing_map_elt elt; - - struct action_data *action_data; - struct hist_trigger_data *hist_data; -}; - -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - -struct snapshot_context { - struct tracing_map_elt *elt; - void *key; -}; - -static void track_data_free(struct track_data *track_data) -{ - struct hist_elt_data *elt_data; - - if (!track_data) - return; - - kfree(track_data->key); - - elt_data = track_data->elt.private_data; - if (elt_data) { - kfree(elt_data->comm); - kfree(elt_data); - } - - kfree(track_data); -} - -static struct track_data *track_data_alloc(unsigned int key_len, - struct action_data *action_data, - struct hist_trigger_data *hist_data) -{ - struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); - struct hist_elt_data *elt_data; - - if (!data) - return ERR_PTR(-ENOMEM); - - data->key = kzalloc(key_len, GFP_KERNEL); - if (!data->key) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - - data->key_len = key_len; - data->action_data = action_data; - data->hist_data = hist_data; - - elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); - if (!elt_data) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - data->elt.private_data = elt_data; - - elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!elt_data->comm) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - - return data; -} - -static char last_cmd[MAX_FILTER_STR_VAL]; -static char last_cmd_loc[MAX_FILTER_STR_VAL]; - -static int errpos(char *str) -{ - return err_pos(last_cmd, str); -} - -static void last_cmd_set(struct trace_event_file *file, char *str) -{ - const char *system = NULL, *name = NULL; - struct trace_event_call *call; - - if (!str) - return; - - strcpy(last_cmd, "hist:"); - strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); - - if (file) { - call = file->event_call; - - system = call->class->system; - if (system) { - name = trace_event_name(call); - if (!name) - system = NULL; - } - } - - if (system) - snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); -} - -static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) -{ - tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, - err_type, err_pos); -} - -static void hist_err_clear(void) -{ - last_cmd[0] = '\0'; - last_cmd_loc[0] = '\0'; -} - -struct synth_trace_event { - struct trace_entry ent; - u64 fields[]; -}; - -static int synth_event_define_fields(struct trace_event_call *call) -{ - struct synth_trace_event trace; - int offset = offsetof(typeof(trace), fields); - struct synth_event *event = call->data; - unsigned int i, size, n_u64; - char *name, *type; - bool is_signed; - int ret = 0; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - size = event->fields[i]->size; - is_signed = event->fields[i]->is_signed; - type = event->fields[i]->type; - name = event->fields[i]->name; - ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); - if (ret) - break; - - event->fields[i]->offset = n_u64; - - if (event->fields[i]->is_string) { - offset += STR_VAR_LEN_MAX; - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - offset += sizeof(u64); - n_u64++; - } - } - - event->n_u64 = n_u64; - - return ret; -} - -static bool synth_field_signed(char *type) -{ - if 
(str_has_prefix(type, "u")) - return false; - if (strcmp(type, "gfp_t") == 0) - return false; - - return true; -} - -static int synth_field_is_string(char *type) -{ - if (strstr(type, "char[") != NULL) - return true; - - return false; -} - -static int synth_field_string_size(char *type) -{ - char buf[4], *end, *start; - unsigned int len; - int size, err; - - start = strstr(type, "char["); - if (start == NULL) - return -EINVAL; - start += sizeof("char[") - 1; - - end = strchr(type, ']'); - if (!end || end < start) - return -EINVAL; - - len = end - start; - if (len > 3) - return -EINVAL; - - strncpy(buf, start, len); - buf[len] = '\0'; - - err = kstrtouint(buf, 0, &size); - if (err) - return err; - - if (size > STR_VAR_LEN_MAX) - return -EINVAL; - - return size; -} - -static int synth_field_size(char *type) -{ - int size = 0; - - if (strcmp(type, "s64") == 0) - size = sizeof(s64); - else if (strcmp(type, "u64") == 0) - size = sizeof(u64); - else if (strcmp(type, "s32") == 0) - size = sizeof(s32); - else if (strcmp(type, "u32") == 0) - size = sizeof(u32); - else if (strcmp(type, "s16") == 0) - size = sizeof(s16); - else if (strcmp(type, "u16") == 0) - size = sizeof(u16); - else if (strcmp(type, "s8") == 0) - size = sizeof(s8); - else if (strcmp(type, "u8") == 0) - size = sizeof(u8); - else if (strcmp(type, "char") == 0) - size = sizeof(char); - else if (strcmp(type, "unsigned char") == 0) - size = sizeof(unsigned char); - else if (strcmp(type, "int") == 0) - size = sizeof(int); - else if (strcmp(type, "unsigned int") == 0) - size = sizeof(unsigned int); - else if (strcmp(type, "long") == 0) - size = sizeof(long); - else if (strcmp(type, "unsigned long") == 0) - size = sizeof(unsigned long); - else if (strcmp(type, "pid_t") == 0) - size = sizeof(pid_t); - else if (strcmp(type, "gfp_t") == 0) - size = sizeof(gfp_t); - else if (synth_field_is_string(type)) - size = synth_field_string_size(type); - - return size; -} - -static const char *synth_field_fmt(char *type) -{ - const char *fmt = "%llu"; - - if (strcmp(type, "s64") == 0) - fmt = "%lld"; - else if (strcmp(type, "u64") == 0) - fmt = "%llu"; - else if (strcmp(type, "s32") == 0) - fmt = "%d"; - else if (strcmp(type, "u32") == 0) - fmt = "%u"; - else if (strcmp(type, "s16") == 0) - fmt = "%d"; - else if (strcmp(type, "u16") == 0) - fmt = "%u"; - else if (strcmp(type, "s8") == 0) - fmt = "%d"; - else if (strcmp(type, "u8") == 0) - fmt = "%u"; - else if (strcmp(type, "char") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned char") == 0) - fmt = "%u"; - else if (strcmp(type, "int") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned int") == 0) - fmt = "%u"; - else if (strcmp(type, "long") == 0) - fmt = "%ld"; - else if (strcmp(type, "unsigned long") == 0) - fmt = "%lu"; - else if (strcmp(type, "pid_t") == 0) - fmt = "%d"; - else if (strcmp(type, "gfp_t") == 0) - fmt = "%x"; - else if (synth_field_is_string(type)) - fmt = "%s"; - - return fmt; -} - -static void print_synth_event_num_val(struct trace_seq *s, - char *print_fmt, char *name, - int size, u64 val, char *space) -{ - switch (size) { - case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); - break; - - case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); - break; - - case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); - break; - - default: - trace_seq_printf(s, print_fmt, name, val, space); - break; - } -} - -static enum print_line_t print_synth_event(struct trace_iterator *iter, - int flags, - struct trace_event *event) -{ - struct trace_array *tr 
= iter->tr; - struct trace_seq *s = &iter->seq; - struct synth_trace_event *entry; - struct synth_event *se; - unsigned int i, n_u64; - char print_fmt[32]; - const char *fmt; - - entry = (struct synth_trace_event *)iter->ent; - se = container_of(event, struct synth_event, call.event); - - trace_seq_printf(s, "%s: ", se->name); - - for (i = 0, n_u64 = 0; i < se->n_fields; i++) { - if (trace_seq_has_overflowed(s)) - goto end; - - fmt = synth_field_fmt(se->fields[i]->type); - - /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) - trace_seq_printf(s, "%s ", fmt); - - snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); - - /* parameter values */ - if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; - char *space = (i == se->n_fields - 1 ? "" : " "); - - print_synth_event_num_val(s, print_fmt, - se->fields[i]->name, - se->fields[i]->size, - entry->fields[n_u64], - space); - - if (strcmp(se->fields[i]->type, "gfp_t") == 0) { - trace_seq_puts(s, " ("); - trace_print_flags_seq(s, "|", - entry->fields[n_u64], - __flags); - trace_seq_putc(s, ')'); - } - n_u64++; - } - } -end: - trace_seq_putc(s, '\n'); - - return trace_handle_return(s); -} - -static struct trace_event_functions synth_event_funcs = { - .trace = print_synth_event -}; - -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct trace_event_file *trace_file = __data; - struct synth_trace_event *entry; - struct trace_event_buffer fbuffer; - struct trace_buffer *buffer; - struct synth_event *event; - unsigned int i, n_u64, val_idx; - int fields_size = 0; - - event = trace_file->event_call->data; - - if (trace_trigger_soft_disabled(trace_file)) - return; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - - entry = trace_event_buffer_reserve(&fbuffer, trace_file, - sizeof(*entry) + fields_size); - if (!entry) - goto out; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - val_idx = var_ref_idx[i]; - if (event->fields[i]->is_string) { - char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = event->fields[i]; - u64 val = var_ref_vals[val_idx]; - - switch (field->size) { - case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; - break; - - default: - entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); -} - -static void free_synth_event_print_fmt(struct trace_event_call *call) -{ - if (call) { - kfree(call->print_fmt); - call->print_fmt = NULL; - } -} - -static int __set_synth_event_print_fmt(struct synth_event *event, - char *buf, int len) -{ - const char *fmt; - int pos = 0; - int i; - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? 
len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - for (i = 0; i < event->n_fields; i++) { - fmt = synth_field_fmt(event->fields[i]->type); - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", - event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); - } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - - for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_synth_event_print_fmt(struct trace_event_call *call) -{ - struct synth_event *event = call->data; - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_synth_event_print_fmt(event, NULL, 0); - - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_synth_event_print_fmt(event, print_fmt, len + 1); - call->print_fmt = print_fmt; - - return 0; -} - -static void free_synth_field(struct synth_field *field) -{ - kfree(field->type); - kfree(field->name); - kfree(field); -} - -static struct synth_field *parse_synth_field(int argc, const char **argv, - int *consumed) -{ - struct synth_field *field; - const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; - - if (field_type[0] == ';') - field_type++; - - if (!strcmp(field_type, "unsigned")) { - if (argc < 3) - return ERR_PTR(-EINVAL); - prefix = "unsigned "; - field_type = argv[1]; - field_name = argv[2]; - *consumed = 3; - } else { - field_name = argv[1]; - *consumed = 2; - } - - field = kzalloc(sizeof(*field), GFP_KERNEL); - if (!field) - return ERR_PTR(-ENOMEM); - - len = strlen(field_name); - array = strchr(field_name, '['); - if (array) - len -= strlen(array); - else if (field_name[len - 1] == ';') - len--; - - field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } - - if (field_type[0] == ';') - field_type++; - len = strlen(field_type) + 1; - if (array) - len += strlen(array); - if (prefix) - len += strlen(prefix); - - field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; - goto free; - } - if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); - if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; - } - - field->size = synth_field_size(field->type); - if (!field->size) { - ret = -EINVAL; - goto free; - } - - if (synth_field_is_string(field->type)) - field->is_string = true; - - field->is_signed = synth_field_signed(field->type); - - out: - return field; - free: - free_synth_field(field); - field = ERR_PTR(ret); - goto out; -} - -static void free_synth_tracepoint(struct tracepoint *tp) -{ - if (!tp) - return; - - kfree(tp->name); - kfree(tp); -} - -static struct tracepoint *alloc_synth_tracepoint(char *name) -{ - struct tracepoint *tp; - - tp = kzalloc(sizeof(*tp), GFP_KERNEL); - if (!tp) - return ERR_PTR(-ENOMEM); - - tp->name = kstrdup(name, GFP_KERNEL); - if (!tp->name) { - kfree(tp); - return ERR_PTR(-ENOMEM); - } - - return tp; -} - -typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, - unsigned int *var_ref_idx); - -static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct tracepoint *tp = event->tp; - - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { - struct tracepoint_func 
*probe_func_ptr; - synth_probe_func_t probe_func; - void *__data; - - if (!(cpu_online(raw_smp_processor_id()))) - return; - - probe_func_ptr = rcu_dereference_sched((tp)->funcs); - if (probe_func_ptr) { - do { - probe_func = probe_func_ptr->func; - __data = probe_func_ptr->data; - probe_func(__data, var_ref_vals, var_ref_idx); - } while ((++probe_func_ptr)->func); - } - } -} - -static struct synth_event *find_synth_event(const char *name) -{ - struct dyn_event *pos; - struct synth_event *event; - - for_each_dyn_event(pos) { - if (!is_synth_event(pos)) - continue; - event = to_synth_event(pos); - if (strcmp(event->name, name) == 0) - return event; - } - - return NULL; -} - -static struct trace_event_fields synth_event_fields_array[] = { - { .type = TRACE_FUNCTION_TYPE, - .define_fields = synth_event_define_fields }, - {} -}; - -static int register_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret = 0; - - event->call.class = &event->class; - event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); - if (!event->class.system) { - ret = -ENOMEM; - goto out; - } - - event->tp = alloc_synth_tracepoint(event->name); - if (IS_ERR(event->tp)) { - ret = PTR_ERR(event->tp); - event->tp = NULL; - goto out; - } - - INIT_LIST_HEAD(&call->class->fields); - call->event.funcs = &synth_event_funcs; - call->class->fields_array = synth_event_fields_array; - - ret = register_trace_event(&call->event); - if (!ret) { - ret = -ENODEV; - goto out; - } - call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; - call->class->probe = trace_event_raw_event_synth; - call->data = event; - call->tp = event->tp; - - ret = trace_add_event_call(call); - if (ret) { - pr_warn("Failed to register synthetic event: %s\n", - trace_event_name(call)); - goto err; - } - - ret = set_synth_event_print_fmt(call); - if (ret < 0) { - trace_remove_event_call(call); - goto err; - } - out: - return ret; - err: - unregister_trace_event(&call->event); - goto out; -} - -static int unregister_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret; - - ret = trace_remove_event_call(call); - - return ret; -} - -static void free_synth_event(struct synth_event *event) -{ - unsigned int i; - - if (!event) - return; - - for (i = 0; i < event->n_fields; i++) - free_synth_field(event->fields[i]); - - kfree(event->fields); - kfree(event->name); - kfree(event->class.system); - free_synth_tracepoint(event->tp); - free_synth_event_print_fmt(&event->call); - kfree(event); -} - -static struct synth_event *alloc_synth_event(const char *name, int n_fields, - struct synth_field **fields) -{ - struct synth_event *event; - unsigned int i; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) { - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->name = kstrdup(name, GFP_KERNEL); - if (!event->name) { - kfree(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); - if (!event->fields) { - free_synth_event(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - dyn_event_init(&event->devent, &synth_event_ops); - - for (i = 0; i < n_fields; i++) - event->fields[i] = fields[i]; - - event->n_fields = n_fields; - out: - return event; -} - -static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, void *key, - struct action_data *data, u64 *var_ref_vals) -{ - struct synth_event *event = 
data->synth_event; - - trace_synth(event, var_ref_vals, data->var_ref_idx); -} - -struct hist_var_data { - struct list_head list; - struct hist_trigger_data *hist_data; -}; - -static int synth_event_check_arg_fn(void *data) -{ - struct dynevent_arg_pair *arg_pair = data; - int size; - - size = synth_field_size((char *)arg_pair->lhs); - - return size ? 0 : -EINVAL; -} - -/** - * synth_event_add_field - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type: The type of the new field to add - * @name: The name of the new field to add - * - * Add a new field to a synthetic event cmd object. Field ordering is in - * the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, - const char *name) -{ - struct dynevent_arg_pair arg_pair; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type || !name) - return -EINVAL; - - dynevent_arg_pair_init(&arg_pair, 0, ';'); - - arg_pair.lhs = type; - arg_pair.rhs = name; - - ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field); - -/** - * synth_event_add_field_str - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type_name: The type and name of the new field to add, as a single string - * - * Add a new field to a synthetic event cmd object, as a single - * string. The @type_name string is expected to be of the form 'type - * name', which will be appended by ';'. No sanity checking is done - - * what's passed in is assumed to already be well-formed. Field - * ordering is in the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) -{ - struct dynevent_arg arg; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type_name) - return -EINVAL; - - dynevent_arg_init(&arg, ';'); - - arg.str = type_name; - - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field_str); - -/** - * synth_event_add_fields - Add multiple fields to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Add a new set of fields to a synthetic event cmd object. The event - * fields that will be defined for the event should be passed in as an - * array of struct synth_field_desc, and the number of elements in the - * array passed in as n_fields. Field ordering will retain the - * ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. 
- */ -int synth_event_add_fields(struct dynevent_cmd *cmd, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - unsigned int i; - int ret = 0; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_fields); - -/** - * __synth_event_gen_cmd_start - Start a synthetic event command from arg list - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field - * - * NOTE: Users normally won't want to call this function directly, but - * rather use the synth_event_gen_cmd_start() wrapper, which - * automatically adds a NULL to the end of the arg list. If this - * function is used directly, make sure the last arg in the variable - * arg list is NULL. - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * There should be an even number variable args, each pair consisting - * of a type followed by a field name. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, ...) -{ - struct dynevent_arg arg; - va_list args; - int ret; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - va_start(args, mod); - for (;;) { - const char *type, *name; - - type = va_arg(args, const char *); - if (!type) - break; - name = va_arg(args, const char *); - if (!name) - break; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, type, name); - if (ret) - break; - } - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); - -/** - * synth_event_gen_cmd_array_start - Start synthetic event command from an array - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * The event fields that will be defined for the event should be - * passed in as an array of struct synth_field_desc, and the number of - * elements in the array passed in as n_fields. Field ordering will - * retain the ordering given in the fields array. - * - * See synth_field_size() for available types. 
If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - struct dynevent_arg arg; - unsigned int i; - int ret = 0; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (n_fields > SYNTH_FIELDS_MAX) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) - return -EINVAL; - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); - -static int __create_synth_event(int argc, const char *name, const char **argv) -{ - struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; - struct synth_event *event = NULL; - int i, consumed = 0, n_fields = 0, ret = 0; - - /* - * Argument syntax: - * - Add synthetic event: field[;field] ... - * - Remove synthetic event: ! field[;field] ... - * where 'field' = type field_name - */ - - if (name[0] == '\0' || argc < 1) - return -EINVAL; - - mutex_lock(&event_mutex); - - event = find_synth_event(name); - if (event) { - ret = -EEXIST; - goto out; - } - - for (i = 0; i < argc - 1; i++) { - if (strcmp(argv[i], ";") == 0) - continue; - if (n_fields == SYNTH_FIELDS_MAX) { - ret = -EINVAL; - goto err; - } - - field = parse_synth_field(argc - i, &argv[i], &consumed); - if (IS_ERR(field)) { - ret = PTR_ERR(field); - goto err; - } - fields[n_fields++] = field; - i += consumed - 1; - } - - if (i < argc && strcmp(argv[i], ";") != 0) { - ret = -EINVAL; - goto err; - } - - event = alloc_synth_event(name, n_fields, fields); - if (IS_ERR(event)) { - ret = PTR_ERR(event); - event = NULL; - goto err; - } - ret = register_synth_event(event); - if (!ret) - dyn_event_add(&event->devent); - else - free_synth_event(event); - out: - mutex_unlock(&event_mutex); - - return ret; - err: - for (i = 0; i < n_fields; i++) - free_synth_field(fields[i]); - - goto out; -} - -/** - * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * @mod: The module creating the event, NULL if not created from a module - * - * Create a new synthetic event with the given name under the - * trace/events/synthetic/ directory. The event fields that will be - * defined for the event should be passed in as an array of struct - * synth_field_desc, and the number elements in the array passed in as - * n_fields. Field ordering will retain the ordering given in the - * fields array. - * - * If the new synthetic event is being created from a module, the mod - * param must be non-NULL. This will ensure that the trace buffer - * won't contain unreadable events. - * - * The new synth event should be deleted using synth_event_delete() - * function. The new synthetic event can be generated from modules or - * other kernel code using trace_synth_event() and related functions. - * - * Return: 0 if successful, error otherwise. 
- */ -int synth_event_create(const char *name, struct synth_field_desc *fields, - unsigned int n_fields, struct module *mod) -{ - struct dynevent_cmd cmd; - char *buf; - int ret; - - buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); - - ret = synth_event_gen_cmd_array_start(&cmd, name, mod, - fields, n_fields); - if (ret) - goto out; - - ret = synth_event_gen_cmd_end(&cmd); - out: - kfree(buf); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_create); - -static int destroy_synth_event(struct synth_event *se) -{ - int ret; - - if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } - } - - return ret; -} - -/** - * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event - * - * Delete a synthetic event that was created with synth_event_create(). - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_delete(const char *event_name) -{ - struct synth_event *se = NULL; - struct module *mod = NULL; - int ret = -ENOENT; - - mutex_lock(&event_mutex); - se = find_synth_event(event_name); - if (se) { - mod = se->mod; - ret = destroy_synth_event(se); - } - mutex_unlock(&event_mutex); - - if (mod) { - mutex_lock(&trace_types_lock); - /* - * It is safest to reset the ring buffer if the module - * being unloaded registered any events that were - * used. The only worry is if a new module gets - * loaded, and takes on the same id as the events of - * this module. When printing out the buffer, traced - * events left over from this module may be passed to - * the new module events and unexpected results may - * occur. - */ - tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_delete); - -static int create_or_delete_synth_event(int argc, char **argv) -{ - const char *name = argv[0]; - int ret; - - /* trace_run_command() ensures argc != 0 */ - if (name[0] == '!') { - ret = synth_event_delete(name + 1); - return ret; - } - - ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); - return ret == -ECANCELED ? -EINVAL : ret; -} - -static int synth_event_run_command(struct dynevent_cmd *cmd) -{ - struct synth_event *se; - int ret; - - ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); - if (ret) - return ret; - - se = find_synth_event(cmd->event_name); - if (WARN_ON(!se)) - return -ENOENT; - - se->mod = cmd->private_data; - - return ret; -} - -/** - * synth_event_cmd_init - Initialize a synthetic event command object - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @buf: A pointer to the buffer used to build the command - * @maxlen: The length of the buffer passed in @buf - * - * Initialize a synthetic event command object. Use this before - * calling any of the other dyenvent_cmd functions. 
- */ -void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) -{ - dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, - synth_event_run_command); -} -EXPORT_SYMBOL_GPL(synth_event_cmd_init); - -static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - int entry_size, fields_size = 0; - int ret = 0; - - memset(trace_state, '\0', sizeof(*trace_state)); - - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->disabled = true; - goto out; - } - - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry_size = sizeof(*trace_state->entry) + fields_size; - trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, - file, - entry_size); - if (!trace_state->entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - } -out: - return ret; -} - -static inline void -__synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); -} - -/** - * synth_event_trace - Trace a synthetic event - * @file: The trace_event_file representing the synthetic event - * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values - * - * Trace a synthetic event using the values passed in the variable - * argument list. - * - * The argument list should be a list 'n_vals' u64 values. The number - * of vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) 
-{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - va_list args; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - u64 val; - - val = va_arg(args, u64); - - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - va_end(args); -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace); - -/** - * synth_event_trace_array - Trace a synthetic event from an array - * @file: The trace_event_file representing the synthetic event - * @vals: Array of values - * @n_vals: The number of values in vals - * - * Trace a synthetic event using the values passed in as 'vals'. - * - * The 'vals' array is just an array of 'n_vals' u64. The number of - * vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_array(struct trace_event_file *file, u64 *vals, - unsigned int n_vals) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - u64 val = vals[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace_array); - -/** - * synth_event_trace_start - Start piecewise synthetic event trace - * @file: The trace_event_file representing the synthetic event - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Start the trace of a synthetic event field-by-field rather than all - * at once. - * - * This function 'opens' an event trace, which means space is reserved - * for the event in the trace buffer, after which the event's - * individual field values can be set through either - * synth_event_add_next_val() or synth_event_add_val(). 
- * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state until the event trace is - * closed (and the event finally traced) using - * synth_event_trace_end(). - * - * Note that synth_event_trace_end() must be called after all values - * have been added for each event trace, regardless of whether adding - * all field values succeeded or not. - * - * Note also that for a given event trace, all fields must be added - * using either synth_event_add_next_val() or synth_event_add_val() - * but not both together or interleaved. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; + char *event_system; + } match_data; - return __synth_event_trace_start(file, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_trace_start); + struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ + char *var_str; -static int __synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - struct synth_field *field = NULL; - struct synth_trace_event *entry; - struct synth_event *event; - int i, ret = 0; + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; - if (!trace_state) { - ret = -EINVAL; - goto out; - } + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; - /* can't mix add_next_synth_val() with add_synth_val() */ - if (field_name) { - if (trace_state->add_next) { - ret = -EINVAL; - goto out; - } - trace_state->add_name = true; - } else { - if (trace_state->add_name) { - ret = -EINVAL; - goto out; - } - trace_state->add_next = true; - } + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; + }; +}; - if (trace_state->disabled) - goto out; +struct track_data { + u64 track_val; + bool updated; - event = trace_state->event; - if (trace_state->add_name) { - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - if (strcmp(field->name, field_name) == 0) - break; - } - if (!field) { - ret = -EINVAL; - goto out; - } - } else { - if (trace_state->cur_field >= event->n_fields) { - ret = -EINVAL; - goto out; - } - field = event->fields[trace_state->cur_field++]; - } + unsigned int key_len; + void *key; + struct tracing_map_elt elt; - entry = trace_state->entry; - if (field->is_string) { - char *str_val = (char *)(long)val; - char *str_field; + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; - if (!str_val) { - ret = -EINVAL; - goto out; - } +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; - str_field = (char *)&entry->fields[field->offset]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else { - switch (field->size) { - case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; - break; +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; - case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; - break; +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; - case 4: - *(u32 
*)&trace_state->entry->fields[field->offset] = (u32)val; - break; + if (!track_data) + return; - default: - trace_state->entry->fields[field->offset] = val; - break; - } + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); } - out: - return ret; -} -/** - * synth_event_add_next_val - Add the next field's value to an open synth trace - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the next field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function assumes all the fields in an event are to be set one - * after another - successive calls to this function are made, one for - * each field, in the order of the fields in the event, until all - * fields have been set. If you'd rather set each field individually - * without regard to ordering, synth_event_add_val() can be used - * instead. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_next_val(u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(NULL, val, trace_state); + kfree(track_data); } -EXPORT_SYMBOL_GPL(synth_event_add_next_val); -/** - * synth_event_add_val - Add a named field's value to an open synth trace - * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the named field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function looks up the field name, and if found, sets the field - * to the specified value. This lookup makes this function more - * expensive than synth_event_add_next_val(), so use that or the - * none-piecewise synth_event_trace() instead if efficiency is more - * important. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. 
- */ -int synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) { - return __synth_event_add_val(field_name, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_val); + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; -/** - * synth_event_trace_end - End piecewise synthetic event trace - * @trace_state: A pointer to object tracking the piecewise trace state - * - * End the trace of a synthetic event opened by - * synth_event_trace__start(). - * - * This function 'closes' an event trace, which basically means that - * it commits the reserved event and cleans up other loose ends. - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state opened with - * synth_event_trace_start(). - * - * Note that this function must be called after all values have been - * added for each event trace, regardless of whether adding all field - * values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; + if (!data) + return ERR_PTR(-ENOMEM); - __synth_event_trace_end(trace_state); + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - return 0; -} -EXPORT_SYMBOL_GPL(synth_event_trace_end); + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; -static int create_synth_event(int argc, const char **argv) -{ - const char *name = argv[0]; - int len; + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; + data->elt.private_data = elt_data; - /* This interface accepts group name prefix */ - if (strchr(name, '/')) { - len = str_has_prefix(name, SYNTH_SYSTEM "/"); - if (len == 0) - return -EINVAL; - name += len; + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); } - return __create_synth_event(argc - 1, name, argv + 1); -} - -static int synth_event_release(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - int ret; - if (event->ref) - return -EBUSY; + return data; +} - ret = unregister_synth_event(event); - if (ret) - return ret; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; - dyn_event_remove(ev); - free_synth_event(event); - return 0; +static int errpos(char *str) +{ + return err_pos(last_cmd, str); } -static int __synth_event_show(struct seq_file *m, struct synth_event *event) +static void last_cmd_set(struct trace_event_file *file, char *str) { - struct synth_field *field; - unsigned int i; + const char *system = NULL, *name = NULL; + struct trace_event_call *call; - seq_printf(m, "%s\t", event->name); + if (!str) + return; - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; + strcpy(last_cmd, "hist:"); + strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); - /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, - i == event->n_fields - 1 ? 
"" : "; "); + if (file) { + call = file->event_call; + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } } - seq_putc(m, '\n'); - - return 0; + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - struct synth_event *event = to_synth_event(ev); - - seq_printf(m, "s:%s/", event->class.system); - - return __synth_event_show(m, event); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } -static int synth_events_seq_show(struct seq_file *m, void *v) +static void hist_err_clear(void) { - struct dyn_event *ev = v; - - if (!is_synth_event(ev)) - return 0; - - return __synth_event_show(m, to_synth_event(ev)); + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } -static const struct seq_operations synth_events_seq_op = { - .start = dyn_event_seq_start, - .next = dyn_event_seq_next, - .stop = dyn_event_seq_stop, - .show = synth_events_seq_show, -}; +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int *var_ref_idx); -static int synth_events_open(struct inode *inode, struct file *file) +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int *var_ref_idx) { - int ret; + struct tracepoint *tp = event->tp; - ret = security_locked_down(LOCKDOWN_TRACEFS); - if (ret) - return ret; + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = dyn_events_release_all(&synth_event_ops); - if (ret < 0) - return ret; - } + if (!(cpu_online(raw_smp_processor_id()))) + return; - return seq_open(file, &synth_events_seq_op); + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } } -static ssize_t synth_events_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) +static void action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - return trace_parse_run_command(file, buffer, count, ppos, - create_or_delete_synth_event); + struct synth_event *event = data->synth_event; + + trace_synth(event, var_ref_vals, data->var_ref_idx); } -static const struct file_operations synth_events_fops = { - .open = synth_events_open, - .write = synth_events_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; }; static u64 hist_field_timestamp(struct hist_field *hist_field, @@ -7653,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } - -static __init int trace_events_hist_init(void) -{ - struct dentry *entry = NULL; - struct dentry *d_tracer; - int err = 0; - - err = dyn_event_register(&synth_event_ops); - if (err) { - pr_warn("Could not register synth_event_ops\n"); - return err; - } - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); - goto err; - } - - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, - 
NULL, &synth_events_fops); - if (!entry) { - err = -ENODEV; - goto err; - } - - return err; - err: - pr_warn("Could not create tracefs 'synthetic_events' entry\n"); - - return err; -} - -fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c new file mode 100644 index 000000000000..c6cca0d1d584 --- /dev/null +++ b/kernel/trace/trace_events_synth.c @@ -0,0 +1,1789 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_synth - synthetic trace events + * + * Copyright (C) 2015, 2020 Tom Zanussi + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* for gfp flag names */ +#include +#include + +#include "trace_synth.h" + +static int create_synth_event(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = create_synth_event, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + event->fields[i]->offset = n_u64; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (str_has_prefix(type, "u")) + return false; + if (strcmp(type, "gfp_t") == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += sizeof("char[") - 1; + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] 
= '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), 
"%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); + + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64, val_idx; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[val_idx]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[val_idx]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? 
"" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(int argc, const char **argv, + int *consumed) +{ + struct synth_field *field; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + int len, ret = 0; + + if (field_type[0] == ';') + field_type++; + + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_name); + array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; + if (array) + len += strlen(array); + if (prefix) + len += strlen(prefix); + + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + if (prefix) + strcat(field->type, prefix); + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +struct synth_event *find_synth_event(const char *name) +{ + struct dyn_event *pos; + struct synth_event *event; + + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class 
= &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->fields_array = synth_event_fields_array; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(const char *name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + dyn_event_init(&event->devent, &synth_event_ops); + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + + return size ? 0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. 
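For orientation (not part of this patch), a caller would typically extend a started command one field at a time with synth_event_add_field(). The minimal sketch below assumes the declarations exported through <linux/trace_events.h> (struct dynevent_cmd, MAX_DYNEVENT_CMD_LEN, synth_event_cmd_init(), the synth_event_gen_cmd_start() wrapper and synth_event_gen_cmd_end()); the event and field names are hypothetical.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/trace_events.h>

/* Sketch: build "mytest_lat" as "u64 lat; char[16] comm", field by field. */
static int example_gen_fields(void)
{
	struct dynevent_cmd cmd;
	char *buf;
	int ret;

	buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);

	/* Start the command with the event name only; fields are added below. */
	ret = synth_event_gen_cmd_start(&cmd, "mytest_lat", THIS_MODULE);
	if (ret)
		goto out;

	ret = synth_event_add_field(&cmd, "u64", "lat");
	if (ret)
		goto out;

	/* "char[16]" is classified as a string field, see synth_field_size(). */
	ret = synth_event_add_field(&cmd, "char[16]", "comm");
	if (ret)
		goto out;

	/* Execute the accumulated command, registering the event. */
	ret = synth_event_gen_cmd_end(&cmd);
out:
	kfree(buf);
	return ret;
}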
+ */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. 
If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. 
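A corresponding sketch for the arg-list form, passing the type/name pairs directly to the synth_event_gen_cmd_start() wrapper mentioned in the kernel-doc above (same header assumptions as the previous sketch, hypothetical names):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/trace_events.h>

/* Sketch: define "block_lat" with three fields in a single call. */
static int example_gen_cmd_start(void)
{
	struct dynevent_cmd cmd;
	char *buf;
	int ret;

	buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);

	/*
	 * Each pair is a type followed by a field name; the wrapper appends
	 * the terminating NULL required by __synth_event_gen_cmd_start().
	 */
	ret = synth_event_gen_cmd_start(&cmd, "block_lat", THIS_MODULE,
					"pid_t", "pid",
					"u64", "delta_ns",
					"char[16]", "comm");
	if (ret)
		goto out;

	ret = synth_event_gen_cmd_end(&cmd);
out:
	kfree(buf);
	return ret;
}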
+ */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + +static int __create_synth_event(int argc, const char *name, const char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + int i, consumed = 0, n_fields = 0, ret = 0; + + /* + * Argument syntax: + * - Add synthetic event: field[;field] ... + * - Remove synthetic event: ! field[;field] ... + * where 'field' = type field_name + */ + + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + + event = find_synth_event(name); + if (event) { + ret = -EEXIST; + goto out; + } + + for (i = 0; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argc - i, &argv[i], &consumed); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields++] = field; + i += consumed - 1; + } + + if (i < argc && strcmp(argv[i], ";") != 0) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + ret = register_synth_event(event); + if (!ret) + dyn_event_add(&event->devent); + else + free_synth_event(event); + out: + mutex_unlock(&event_mutex); + + return ret; + err: + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + + goto out; +} + +/** + * synth_event_create - Create a new synthetic event + * @name: The name of the new sythetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * @mod: The module creating the event, NULL if not created from a module + * + * Create a new synthetic event with the given name under the + * trace/events/synthetic/ directory. The event fields that will be + * defined for the event should be passed in as an array of struct + * synth_field_desc, and the number elements in the array passed in as + * n_fields. Field ordering will retain the ordering given in the + * fields array. + * + * If the new synthetic event is being created from a module, the mod + * param must be non-NULL. This will ensure that the trace buffer + * won't contain unreadable events. + * + * The new synth event should be deleted using synth_event_delete() + * function. The new synthetic event can be generated from modules or + * other kernel code using trace_synth_event() and related functions. + * + * Return: 0 if successful, error otherwise. 
+ */ +int synth_event_create(const char *name, struct synth_field_desc *fields, + unsigned int n_fields, struct module *mod) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + ret = synth_event_gen_cmd_array_start(&cmd, name, mod, + fields, n_fields); + if (ret) + goto out; + + ret = synth_event_gen_cmd_end(&cmd); + out: + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_create); + +static int destroy_synth_event(struct synth_event *se) +{ + int ret; + + if (se->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); + } + } + + return ret; +} + +/** + * synth_event_delete - Delete a synthetic event + * @event_name: The name of the new sythetic event + * + * Delete a synthetic event that was created with synth_event_create(). + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + struct module *mod = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) { + mod = se->mod; + ret = destroy_synth_event(se); + } + mutex_unlock(&event_mutex); + + if (mod) { + mutex_lock(&trace_types_lock); + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + mutex_unlock(&trace_types_lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + +static int create_or_delete_synth_event(int argc, char **argv) +{ + const char *name = argv[0]; + int ret; + + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + ret = synth_event_delete(name + 1); + return ret; + } + + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int synth_event_run_command(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. 
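The one-shot synth_event_create()/synth_event_delete() pair implemented above might be driven from a module roughly as in the following sketch (hypothetical event and field names; struct synth_field_desc and the prototypes are assumed to come from <linux/trace_events.h>):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/trace_events.h>

/* Field ordering in the event follows the ordering of this array. */
static struct synth_field_desc example_fields[] = {
	{ .type = "pid_t",	.name = "pid"		},
	{ .type = "u64",	.name = "delta_ns"	},
	{ .type = "char[16]",	.name = "comm"		},
};

static int __init example_init(void)
{
	return synth_event_create("example_lat", example_fields,
				  ARRAY_SIZE(example_fields), THIS_MODULE);
}

static void __exit example_exit(void)
{
	/* Fails with -EBUSY if the event still has references. */
	synth_event_delete("example_lat");
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");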
+ */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_command); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) 
+{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + va_list args; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + va_end(args); +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. 
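To illustrate the two non-piecewise tracing calls above, a minimal sketch follows; the event name is hypothetical, and trace_get_event_file()/trace_put_event_file() are assumed to be available from <linux/trace_events.h> (they are not part of this hunk):

#include <linux/err.h>
#include <linux/trace_events.h>

/* Sketch: emit one record via the variadic form and one via the array form. */
static int example_trace_values(u64 pid, u64 delta_ns, const char *comm)
{
	struct trace_event_file *file;
	u64 vals[3];
	int ret;

	file = trace_get_event_file(NULL, "synthetic", "example_lat");
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Variadic form: one u64 per field, strings passed as pointers. */
	ret = synth_event_trace(file, 3, pid, delta_ns, (u64)(long)comm);
	if (ret)
		goto out;

	/* Array form: same values, same ordering as the event fields. */
	vals[0] = pid;
	vals[1] = delta_ns;
	vals[2] = (u64)(long)comm;
	ret = synth_event_trace_array(file, vals, 3);
out:
	trace_put_event_file(file);
	return ret;
}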
+ * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret; + + if (!trace_state) + return -EINVAL; + + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (trace_state->disabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + + entry = trace_state->entry; + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (!str_val) { + ret = -EINVAL; + goto out; + } + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } + out: + return ret; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. 
+ * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_next_val(u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(NULL, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_next_val); + +/** + * synth_event_add_val - Add a named field's value to an open synth trace + * @field_name: The name of the synthetic event field value to set + * @val: The value to set the named field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the named field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function looks up the field name, and if found, sets the field + * to the specified value. This lookup makes this function more + * expensive than synth_event_add_next_val(), so use that or the + * non-piecewise synth_event_trace() instead if efficiency is more + * important. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise.
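+ *
+ * A minimal usage sketch of the piecewise API (illustrative only, not
+ * part of the original patch; error handling is trimmed and 'file' is
+ * assumed to be the trace_event_file of a synthetic event with two
+ * u64 fields):
+ *
+ *	struct synth_event_trace_state state;
+ *
+ *	if (!synth_event_trace_start(file, &state)) {
+ *		synth_event_add_next_val(42, &state);
+ *		synth_event_add_next_val(314, &state);
+ *		synth_event_trace_end(&state);
+ *	}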
+ */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + __synth_event_trace_end(trace_state); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + +static int create_synth_event(int argc, const char **argv) +{ + const char *name = argv[0]; + int len; + + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); +} + +static int synth_event_release(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; +} + +static int __synth_event_show(struct seq_file *m, struct synth_event *event) +{ + struct synth_field *field; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? "" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + +static const struct seq_operations synth_events_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&synth_event_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_synth_init); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h new file mode 100644 index 000000000000..ac35c45207c4 --- /dev/null +++ b/kernel/trace/trace_synth.h @@ -0,0 
+1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_SYNTH_H +#define __TRACE_SYNTH_H + +#include "trace_dynevent.h" + +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 32 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + +struct synth_field { + char *type; + char *name; + size_t size; + unsigned int offset; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct dyn_event devent; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; + struct module *mod; +}; + +extern struct synth_event *find_synth_event(const char *name); + +#endif /* __TRACE_SYNTH_H */ -- cgit v1.2.3 From c200784a08d4ea82f82a30678955b7f2c7550af4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 29 May 2020 10:46:32 -0400 Subject: tracing: Add a trace print when traceoff_on_warning is triggered When "traceoff_on_warning" is enabled and a warning happens, there can still be many trace events happening on other CPUs between the time the warning occurred and the last trace event on that same CPU. This can cause confusion in examining the trace, as it may not be obvious where the warning happened. By adding a trace print into the trace just before disabling tracing, it makes it obvious where the warning occurred, and the developer doesn't have to look at other means to see what CPU it occurred on. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..760fd102dbe2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { - if (__disable_trace_on_warning) + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); tracing_off(); + } } /** -- cgit v1.2.3 From 10cdb15759540f03d056e2f73fe26377ed7dcfda Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 1 Jun 2020 08:44:40 +0000 Subject: workqueue: use BUILD_BUG_ON() for compile time test instead of WARN_ON() Any runtime WARN_ON() has to be fixed, and BUILD_BUG_ON() can help you nitice it earlier. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c667ca5aed61..9fbe1e237563 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5917,7 +5917,7 @@ void __init workqueue_init_early(void) int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); -- cgit v1.2.3 From fe537393b5795ecbe5746eec0e16124bc998a594 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 25 May 2020 14:29:28 +0200 Subject: bpf: Fix returned error sign when link doesn't support updates System calls encode returned errors as negative values. Fix a typo that breaks this convention for bpf(LINK_UPDATE) when bpf_link doesn't support update operation. 
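To make the convention concrete: callers check for a negative return, so a positive EINVAL silently looks like success. A hedged user-space sketch (not taken from the patch; the fd variables are placeholders):

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int update_link(int link_fd, int new_prog_fd)
	{
		union bpf_attr attr = {};
		int err;

		attr.link_update.link_fd = link_fd;
		attr.link_update.new_prog_fd = new_prog_fd;

		err = syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));
		if (err < 0)	/* a positive EINVAL would slip past this check */
			perror("LINK_UPDATE");

		return err;
	}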
Fixes: f9d041271cf4 ("bpf: Refactor bpf_link update handling") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525122928.1164495-1-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aaa29fb6f363..d13b804ff045 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3924,7 +3924,7 @@ static int link_update(union bpf_attr *attr) if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else - ret = EINVAL; + ret = -EINVAL; out_put_progs: if (old_prog) -- cgit v1.2.3 From 0142dddcbe965450338076c486d0d757b3184352 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Tue, 26 May 2020 11:00:24 +1200 Subject: bpf: Fix spelling in comment explaining ARG1 in ___bpf_prog_run Change 'handeled' to 'handled'. Signed-off-by: Chris Packham Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525230025.14470-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c40ff4cf9880..af52ca658c73 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1543,7 +1543,7 @@ select_insn: /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is - * handeled like a helper, that is, bpf_tail_call_proto, + * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; -- cgit v1.2.3 From f470378c7562a2818b45ed11c98973f2b89eedd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:55 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with probe_* and *current_task* Often it is useful when applying policy to know something about the task. If the administrator has CAP_SYS_ADMIN rights then they can use kprobe + networking hook and link the two programs together to accomplish this. However, this is a bit clunky and also means we have to call both the network program and kprobe program when we could just use a single program and avoid passing metadata through sk_msg/skb->cb, socket, maps, etc. To accomplish this add probe_* helpers to bpf_base_func_proto programs guarded by a perfmon_capable() check. 
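As a rough illustration of the kind of single-program policy this enables, a socket-level program can now look at the current task directly. A hedged sketch, not from this patch; the section name, field access and vmlinux.h/BTF setup are illustrative assumptions:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	SEC("cgroup_skb/egress")
	int egress_task_policy(struct __sk_buff *skb)
	{
		struct task_struct *task;
		char comm[16] = {};

		/* both helpers below are only offered when the loader is perfmon_capable() */
		task = (struct task_struct *)bpf_get_current_task();
		bpf_probe_read_kernel_str(comm, sizeof(comm), task->comm);

		/* apply per-command policy or accounting here */
		bpf_printk("egress packet from %s\n", comm);

		return 1;	/* allow */
	}

	char LICENSE[] SEC("license") = "GPL";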
New supported helpers are the following, BPF_FUNC_get_current_task BPF_FUNC_probe_read_user BPF_FUNC_probe_read_kernel BPF_FUNC_probe_read_user_str BPF_FUNC_probe_read_kernel_str Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033905529.12355.4368381069655254932.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 24 ++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 10 +++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 886949fdcece..bb4fb634275e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,12 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -647,6 +653,24 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9531f54d0a3a..187cd6995bbb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -907,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto 
bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, -- cgit v1.2.3 From 1ea0f9120c8ce105ca181b070561df5cbd6bc049 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:59 +0000 Subject: bpf: Fix map permissions check The map_lookup_and_delete_elem() function should check for both FMODE_CAN_WRITE and FMODE_CAN_READ permissions because it returns a map element to user space. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-5-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d13b804ff045..2c969a9b90d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1472,7 +1472,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } -- cgit v1.2.3 From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF could represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill. Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. 
This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But it would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but neither do a few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has the advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being a familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For the common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward as it would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with a hashed task tgid as the lookup key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of the ring buffer and has to be a power of 2 value. There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and the new BPF ring buffer semantics: - variable-length records; - if there is no more space left in the ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to data inside the ring buffer data area is returned, which BPF programs can use similarly to data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes the consumer ignore the record. bpf_ringbuf_output() has the disadvantage of incurring an extra memory copy, because the record has to be prepared in some other place first. But it allows submitting records of a length that's not known to the verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly into ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have to use an extra per-CPU array as a temporary heap for preparing a sample. bpf_ringbuf_reserve() avoids this need completely. But in exchange, it only allows a known constant size of memory to be reserved, such that the verifier can verify that the BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to the extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small.
Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within a single BPF program invocation. Each reserved record is tracked by the verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record but forget to submit (or discard) it. The bpf_ringbuf_query() helper allows querying various properties of the ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns the amount of unconsumed data in the ring buffer; - BPF_RB_RING_SIZE returns the size of the ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS return the current logical position of the consumer/producer, respectively. Returned values are momentary snapshots of ring buffer state and could be off by the time the helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics that take into account the highly changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in the ring buffer. Together with the BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for the output/commit/discard helpers, it allows a BPF program a high degree of control and, e.g., more efficient batched notifications. The default self-balancing strategy, though, should be adequate for most applications and will work reliably and efficiently already. Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if a BPF program was interrupted by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context bpf_ringbuf_reserve() might fail to get the lock, in which case reservation will fail even if the ring buffer is not full. The ring buffer itself is internally implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - the consumer counter shows up to which logical position the consumer has consumed the data; - the producer counter denotes the amount of data reserved by all producers. Each time a record is reserved, the producer that "owns" the record will successfully advance the producer counter. At that point, the data is still not yet ready to be consumed, though. Each record has an 8-byte header, which contains the length of the reserved record, as well as two extra bits: a busy bit to denote that the record is still being worked on, and a discard bit, which might be set at commit time if the record is discarded. In the latter case, the consumer is supposed to skip the record and move on to the next one. The record header also encodes the record's relative offset from the beginning of the ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without also requiring the pointer to the ring buffer itself. The ring buffer memory location will be restored from the record metadata header.
This significantly simplifies the verifier, as well as improving API usability. Producer counter increments are serialized under a spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to the consumer in the order of reservations, but only after all previous records were already committed. It is thus possible for slow producers to temporarily hold off submitted records that were reserved later. The reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit that significantly simplifies (and thus speeds up as well) the implementation of both producers and consumers is how the data area is mapped twice contiguously back-to-back in virtual memory. This avoids any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be the first data page again, and thus the sample will still appear completely contiguous in virtual memory. See the comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from the perf ring buffer is its self-pacing notification of new data availability. The bpf_ringbuf_commit() implementation will send a notification of a new record being available after commit only if the consumer has already caught up right up to the record being committed. If not, the consumer still has to catch up and thus will see the new data anyway, without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows achieving a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with the perf buffer. For extreme cases, when a BPF program wants more manual control of notifications, the commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before implementing the BPF ring buffer from scratch, existing alternatives in the kernel were evaluated, but they didn't seem to meet the needs. They largely fell into a few categories: - per-CPU buffers (perf, ftrace, etc), which don't satisfy the two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping a contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning an SPSC queue into MPSC w/ a lock would have subpar performance compared to locked reserve + lockless commit, as with the BPF ring buffer. Fixed-sized elements would be too limiting for BPF programs, given existing BPF programs already heavily rely on the variable-sized perf buffer; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications that didn't seem to fit well for the intended use with BPF programs.
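For orientation, here is a minimal BPF-side sketch of the reserve/submit API described above. It is hedged and illustrative only, loosely modeled on the selftests; the map name, event layout, sizes and attach point are made up, and bpf_helpers.h macros (SEC, __uint) are assumed:

	struct event {
		int pid;
		char comm[16];
	};

	struct {
		__uint(type, BPF_MAP_TYPE_RINGBUF);
		__uint(max_entries, 256 * 1024);	/* power-of-2 size, multiple of PAGE_SIZE */
	} rb SEC(".maps");

	SEC("tracepoint/sched/sched_process_exec")
	int handle_exec(void *ctx)
	{
		struct event *e;

		e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
		if (!e)
			return 0;	/* buffer full, or NMI couldn't take the spinlock */

		e->pid = bpf_get_current_pid_tgid() >> 32;
		bpf_get_current_comm(e->comm, sizeof(e->comm));

		if (e->pid == 1)
			bpf_ringbuf_discard(e, 0);	/* consumer will skip this record */
		else
			bpf_ringbuf_submit(e, 0);	/* flags 0 = default self-paced wakeup */

		return 0;
	}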
[0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 12 ++ kernel/bpf/verifier.c | 195 +++++++++++++----- kernel/trace/bpf_trace.c | 10 + 6 files changed, 680 insertions(+), 50 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. 
+ */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + 
bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring 
buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + 
.arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == 
PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct 
bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. 
+ /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. 
*/ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return 
&bpf_ringbuf_query_proto; default: return NULL; } -- cgit v1.2.3 From b36e62eb85215a60916f910070f6d494b4f3e73a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 May 2020 17:48:10 -0700 Subject: bpf: Use strncpy_from_unsafe_strict() in bpf_seq_printf() helper In bpf_seq_printf() helper, when user specified a "%s" in the format string, strncpy_from_unsafe() is used to read the actual string to a buffer. The string could be a format string or a string in the kernel data structure. It is really unlikely that the string will reside in the user memory. This is different from Commit b2a5212fb634 ("bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier") which still used strncpy_from_unsafe() for "%s" to preserve the old behavior. If in the future, bpf_seq_printf() indeed needs to read user memory, we can implement "%pus" format string. Based on discussion in [1], if the intent is to read kernel memory, strncpy_from_unsafe_strict() should be used. So this patch changed to use strncpy_from_unsafe_strict(). [1]: https://lore.kernel.org/bpf/20200521152301.2587579-1-hch@lst.de/T/ Fixes: 492e639f0c22 ("bpf: Add bpf_seq_printf and bpf_seq_write helpers") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200529004810.3352219-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3767d34114c0..b6c24be5ff53 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -585,9 +585,9 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, goto out; } - err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3 From 7f1c04269fe7b3293dea38ea65da4fd6614d6f80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:12 -0600 Subject: devmap: Formalize map value as a named struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'struct bpf_devmap_val' to formalize the expected values that can be passed in for a DEVMAP. Update devmap code to use the struct. 
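As a usage illustration (not part of this patch), a user-space writer can now populate the named struct rather than a bare ifindex. This is a minimal sketch assuming libbpf and an already-created DEVMAP fd; the struct mirror and function names are placeholders:

#include <stdint.h>
#include <bpf/bpf.h>            /* bpf_map_update_elem() */

/* Local mirror of the new DEVMAP value; only ifindex exists at this point. */
struct devmap_val_example {
        uint32_t ifindex;       /* device index */
};

static int devmap_set_slot(int map_fd, uint32_t slot, uint32_t ifindex)
{
        struct devmap_val_example val = { .ifindex = ifindex };

        /* The map's value_size must match sizeof(val), i.e. 4 bytes here. */
        return bpf_map_update_elem(map_fd, &slot, &val, BPF_ANY);
}

A later patch in this series extends the struct with a bpf_prog fd/id union, which is why the value is formalized as a named struct now.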
Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-2-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a51d9fb7a359..a1459de0914e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,12 +60,18 @@ struct xdp_dev_bulk_queue { unsigned int count; }; +/* DEVMAP values */ +struct bpf_devmap_val { + u32 ifindex; /* device index */ +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { @@ -472,18 +478,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -541,7 +544,7 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { struct bpf_dtab_netdev *dev; @@ -551,16 +554,18 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); - } + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; dev->idx = idx; dev->dtab = dtab; + dev->val.ifindex = val->ifindex; return dev; +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, @@ -568,7 +573,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -578,10 +583,13 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -609,12 +617,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified 
value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -623,7 +634,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the prorgam to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 
2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map 
*map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. >From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: -- cgit v1.2.3 From bb2359f4dbe98e8863b4e885fc09269ef4682ec3 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 1 Jun 2020 19:28:14 +0300 Subject: bpf: Change kvfree to kfree in generic_map_lookup_batch() buf_prevkey in generic_map_lookup_batch() is allocated with kmalloc(). It's safe to free it with kfree(). 
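The underlying rule is simply that allocation and free primitives should be paired: kvfree() happens to cope with kmalloc()'ed memory, but using it there hides how the buffer was obtained. A standalone sketch of the convention (illustrative only, not code from this patch):

#include <linux/errno.h>
#include <linux/slab.h>         /* kmalloc(), kfree() */
#include <linux/mm.h>           /* kvmalloc(), kvfree() */

static int pairing_example(size_t len)
{
        void *small = kmalloc(len, GFP_KERNEL);         /* slab only */
        void *large = kvmalloc(len, GFP_KERNEL);        /* may fall back to vmalloc */

        if (!small || !large) {
                kfree(small);           /* kfree(NULL) is a no-op */
                kvfree(large);          /* kvfree(NULL) is a no-op */
                return -ENOMEM;
        }

        /* ... use the buffers ... */

        kfree(small);           /* kmalloc() pairs with kfree() */
        kvfree(large);          /* kvmalloc() pairs with kvfree() */
        return 0;
}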
Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Signed-off-by: Denis Efremov Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200601162814.17426-1-efremov@linux.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9de3540fa90c..e83b0818b529 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1399,7 +1399,7 @@ int generic_map_lookup_batch(struct bpf_map *map, buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kvfree(buf_prevkey); + kfree(buf_prevkey); return -ENOMEM; } -- cgit v1.2.3 From 1b698fa5d8ef958007c455e316aa44c37ab3c5fb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 28 May 2020 22:47:29 +0200 Subject: xdp: Rename convert_to_xdp_frame in xdp_convert_buff_to_frame In order to use standard 'xdp' prefix, rename convert_to_xdp_frame utility routine in xdp_convert_buff_to_frame and replace all the occurrences Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/6344f739be0d1a08ab2b9607584c4d5478c8c083.1590698295.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8b85bfddfac7..27595fc6da56 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -621,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, { struct xdp_frame *xdpf; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c04fb1c72f5e..854b09beb16b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -465,7 +465,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; -- cgit v1.2.3 From 958a3f2d2aff896ae2a622878e456114f4a4cd15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 31 May 2020 17:42:55 +0200 Subject: bpf: Use tracing helpers for lsm programs Currenty lsm uses bpf_tracing_func_proto helpers which do not include stack trace or perf event output. It's useful to have those for bpftrace lsm support [1]. Using tracing_prog_func_proto helpers for lsm programs. 
[1] https://github.com/iovisor/bpftrace/pull/1347 Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200531154255.896551-1-jolsa@kernel.org --- kernel/bpf/bpf_lsm.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b6c24be5ff53..c41186417d93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1467,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. 
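The container referenced as net->bpf throughout the kernel/ diffs that follow lives in the netns headers, outside the kernel/ tree shown in this series; judging from the accessors used here, its shape is roughly the following sketch, included only for orientation:

/* One slot per netns attach type; only the flow dissector exists so far. */
enum netns_bpf_attach_type {
        NETNS_BPF_INVALID = -1,
        NETNS_BPF_FLOW_DISSECTOR = 0,
        MAX_NETNS_BPF_ATTACH_TYPE
};

struct netns_bpf {
        /* Attached programs, RCU-protected for the lockless read side. */
        struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE];
};

A later patch in this series adds a matching links[] array alongside progs[] for bpf_link attachments.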
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- kernel/bpf/syscall.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. 
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 kernel/bpf/net_namespace.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); -- cgit v1.2.3 From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when link exists result in -EINVAL. Attachment of multiple links of same attach type to one netns is not supported with the intention to lift the restriction when a use-case presents itself. Because of that link create returns -E2BIG when trying to create another netns link, when one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, link lives in defunct state as long there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if netns is still alive (refcnt > 0). 
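From user space the new path is reached through the LINK_CREATE command; below is a minimal, hedged sketch using libbpf, assuming an already-loaded flow dissector program fd (the netns path and function name are illustrative):

#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>            /* bpf_link_create() */

static int attach_flow_dissector_link(int prog_fd)
{
        DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
        int netns_fd, link_fd;

        netns_fd = open("/proc/self/ns/net", O_RDONLY);
        if (netns_fd < 0)
                return -1;

        /* target_fd is a netns fd; the attach type selects the netns hook. */
        link_fd = bpf_link_create(prog_fd, netns_fd, BPF_FLOW_DISSECTOR, &opts);

        close(netns_fd);
        return link_fd;         /* closing this fd detaches, unless the link is pinned */
}

Per the rules above, this call returns -EEXIST if a program is already attached directly to the netns, and -E2BIG if another link of the same attach type already exists.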
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + 2 files changed, 245 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. 
+ */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; 
+ } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 0c047ecbb7bab4c1d2136f5f04bb47a66a9a12b8 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:39 +0200 Subject: bpf, cgroup: Return ENOLINK for auto-detached links on update Failure to update a bpf_link because it has been auto-detached by a dying cgroup currently results in EINVAL error, even though the arguments passed to bpf() syscall are not wrong. bpf_links attaching to netns in this case will return ENOLINK, which carries the message that the link is no longer attached to anything. Change cgroup bpf_links to do the same to keep the uAPI errors consistent. 
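For link owners the practical effect is that a failed bpf_link_update() can now distinguish "target went away" from a bad argument; a hedged user-space sketch (libbpf, illustrative names):

#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>            /* bpf_link_update() */

static int swap_link_prog(int link_fd, int new_prog_fd)
{
        DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts);

        if (bpf_link_update(link_fd, new_prog_fd, &opts)) {
                if (errno == ENOLINK)
                        fprintf(stderr, "link auto-detached: cgroup or netns is gone\n");
                else
                        fprintf(stderr, "link update failed: errno=%d\n", errno);
                return -1;
        }
        return 0;
}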
Fixes: 0c991ebc8c69 ("bpf: Implement bpf_prog replacement for an active bpf_cgroup_link") Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-6-jakub@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5c0e964105ac..fdf7836750a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -595,7 +595,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, mutex_lock(&cgroup_mutex); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) { - ret = -EINVAL; + ret = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { -- cgit v1.2.3 From 25de110d148666752dc0e0da7a0b69de31cd7098 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Jun 2020 12:08:39 +0200 Subject: irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too Some SMP platforms don't have CONFIG_IRQ_WORK defined, resulting in a link error at build time. Define a stub and clean up the prototype definitions. Reported-by: kbuild test robot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4dec04f7fdc5..c80486a7e3b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,8 +194,6 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } -extern void irq_work_single(void *); - /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * -- cgit v1.2.3 From b1350132fef7e1f0ddfd5a985d516a6ed7a329fc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 26 May 2020 14:20:06 -0700 Subject: kgdb: Don't call the deinit under spinlock When I combined kgdboc_earlycon with an inflight patch titled ("soc: qcom-geni-se: Add interconnect support to fix earlycon crash") [1] things went boom. Specifically I got a crash during the transition between kgdboc_earlycon and the main kgdboc that looked like this: Call trace: __schedule_bug+0x68/0x6c __schedule+0x75c/0x924 schedule+0x8c/0xbc schedule_timeout+0x9c/0xfc do_wait_for_common+0xd0/0x160 wait_for_completion_timeout+0x54/0x74 rpmh_write_batch+0x1fc/0x23c qcom_icc_bcm_voter_commit+0x1b4/0x388 qcom_icc_set+0x2c/0x3c apply_constraints+0x5c/0x98 icc_set_bw+0x204/0x3bc icc_put+0x30/0xf8 geni_remove_earlycon_icc_vote+0x6c/0x9c qcom_geni_serial_earlycon_exit+0x10/0x1c kgdboc_earlycon_deinit+0x38/0x58 kgdb_register_io_module+0x11c/0x194 configure_kgdboc+0x108/0x174 kgdboc_probe+0x38/0x60 platform_drv_probe+0x90/0xb0 really_probe+0x130/0x2fc ... The problem was that we were holding the "kgdb_registration_lock" while calling into code that didn't expect to be called in spinlock context. Let's slightly defer when we call the deinit code so that it's not done under spinlock. NOTE: this does mean that the "deinit" call of the old kgdb IO module is now made _after_ the init of the new IO module, but presumably that's OK. 
[1] https://lkml.kernel.org/r/1588919619-21355-3-git-send-email-akashast@codeaurora.org Fixes: 220995622da5 ("kgdboc: Add kgdboc_earlycon to support early kgdb using boot consoles") Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200526142001.1.I523dc33f96589cb9956f5679976d402c8cda36fa@changeid [daniel.thompson@linaro.org: Resolved merge issues by hand] Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 4d59aa907fdc..ef94e906f05a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1089,7 +1089,6 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) } pr_info("Replacing I/O driver %s with %s\n", old_dbg_io_ops->name, new_dbg_io_ops->name); - old_dbg_io_ops->deinit(); } if (new_dbg_io_ops->init) { @@ -1104,8 +1103,10 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - if (old_dbg_io_ops) + if (old_dbg_io_ops) { + old_dbg_io_ops->deinit(); return 0; + } pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); -- cgit v1.2.3 From 1b310030bb855b9b13d1c0a9feffdb54883b06ab Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 16:11:46 -0700 Subject: kdb: Cleanup math with KDB_CMD_HISTORY_COUNT From code inspection the math in handle_ctrl_cmd() looks super sketchy because it subjects -1 from cmdptr and then does a "% KDB_CMD_HISTORY_COUNT". It turns out that this code works because "cmdptr" is unsigned and KDB_CMD_HISTORY_COUNT is a nice power of 2. Let's make this a little less sketchy. This patch should be a no-op. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200507161125.1.I2cce9ac66e141230c3644b8174b6c15d4e769232@changeid Reviewed-by: Sumit Garg Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 515379cbf209..6865a0f58d38 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1108,7 +1108,8 @@ static int handle_ctrl_cmd(char *cmd) switch (*cmd) { case CTRL_P: if (cmdptr != cmd_tail) - cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) % + KDB_CMD_HISTORY_COUNT; strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: -- cgit v1.2.3 From c893de12e1ef17b581eb2cf8fc9018ec0cbd07df Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 21 May 2020 15:21:25 +0800 Subject: kdb: Remove the misfeature 'KDBFLAGS' Currently, 'KDBFLAGS' is an internal variable of kdb, it is combined by 'KDBDEBUG' and state flags. It will be shown only when 'KDBDEBUG' is set, and the user can define an environment variable named 'KDBFLAGS' too. These are puzzling indeed. After communication with Daniel, it seems that 'KDBFLAGS' is a misfeature. So let's replace 'KDBFLAGS' with 'KDBDEBUG' to just show the value we wrote into. After this modification, we can use `md4c1 kdb_flags` instead, to observe the state flags. 
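The bit manipulation involved can be pictured with a small standalone sketch; the shift and mask values below are stand-ins for the real KDB_DEBUG_FLAG_* definitions and are not taken from this patch:

#include <stdio.h>

#define DBG_SHIFT 16            /* stand-in for KDB_DEBUG_FLAG_SHIFT */
#define DBG_MASK  0xffffu       /* stand-in for KDB_DEBUG_FLAG_MASK */

static unsigned int kdb_flags;  /* unsigned, so the shift below is well defined */

/* Mirrors 'set KDBDEBUG=<val>': replace only the debug bits of kdb_flags. */
static void set_kdbdebug(unsigned int debugflags)
{
        kdb_flags = (kdb_flags & ~(DBG_MASK << DBG_SHIFT)) |
                    ((debugflags & DBG_MASK) << DBG_SHIFT);
}

/* Mirrors the new 'env' output: report only the debug bits, as KDBDEBUG. */
static void show_kdbdebug(void)
{
        printf("KDBDEBUG=0x%x\n", (kdb_flags >> DBG_SHIFT) & DBG_MASK);
}

int main(void)
{
        set_kdbdebug(0x3);
        show_kdbdebug();        /* prints KDBDEBUG=0x3 */
        return 0;
}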
Suggested-by: Daniel Thompson Signed-off-by: Wei Li Link: https://lore.kernel.org/r/20200521072125.21103-1-liwei391@huawei.com [daniel.thompson@linaro.org: Make kdb_flags unsigned to avoid arithmetic right shift] Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6865a0f58d38..ec190569f690 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -62,7 +62,7 @@ int kdb_grep_trailing; /* * Kernel debugger state flags */ -int kdb_flags; +unsigned int kdb_flags; /* * kdb_lock protects updates to kdb_initial_cpu. Used to @@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv) argv[2]); return 0; } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK)) | (debugflags << KDB_DEBUG_FLAG_SHIFT); return 0; @@ -2082,7 +2081,8 @@ static int kdb_env(int argc, const char **argv) } if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + kdb_printf("KDBDEBUG=0x%x\n", + (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT); return 0; } -- cgit v1.2.3 From b3e2d20973db3ec87a6dd2fee0c88d3c2e7c2f61 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 17 Apr 2020 12:02:45 +0800 Subject: rcuperf: Fix printk format warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using "%zu" to fix following warning, kernel/rcu/rcuperf.c: In function ‘kfree_perf_init’: include/linux/kern_levels.h:5:18: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 2 has type ‘unsigned int’ [-Wformat=] Signed-off-by: Kefeng Wang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..9eb39c20082c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -723,7 +723,7 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } - pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); -- cgit v1.2.3 From a37b0715ddf3007734c4e2424c14bc7efcdd1190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:18 -0700 Subject: mm/writeback: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE PF_LESS_THROTTLE exists for loop-back nfsd (and a similar need in the loop block driver and callers of prctl(PR_SET_IO_FLUSHER)), where a daemon needs to write to one bdi (the final bdi) in order to free up writes queued to another bdi (the client bdi). The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty pages, so that it can still dirty pages after other processses have been throttled. The purpose of this is to avoid deadlock that happen when the PF_LESS_THROTTLE process must write for any dirty pages to be freed, but it is being thottled and cannot write. This approach was designed when all threads were blocked equally, independently on which device they were writing to, or how fast it was. Since that time the writeback algorithm has changed substantially with different threads getting different allowances based on non-trivial heuristics. This means the simple "add 25%" heuristic is no longer reliable. 
The important issue is not that the daemon needs a *larger* dirty page allowance, but that it needs a *private* dirty page allowance, so that dirty pages for the "client" bdi that it is helping to clear (the bdi for an NFS filesystem or loop block device etc) do not affect the throttling of the daemon writing to the "final" bdi. This patch changes the heuristic so that the task is not throttled when the bdi it is writing to has a dirty page count below below (or equal to) the free-run threshold for that bdi. This ensures it will always be able to have some pages in flight, and so will not deadlock. In a steady-state, it is expected that PF_LOCAL_THROTTLE tasks might still be throttled by global threshold, but that is acceptable as it is only the deadlock state that is interesting for this flag. This approach of "only throttle when target bdi is busy" is consistent with the other use of PF_LESS_THROTTLE in current_may_throttle(), were it causes attention to be focussed only on the target bdi. So this patch - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE, - removes the 25% bonus that that flag gives, and - If PF_LOCAL_THROTTLE is set, don't delay at all unless the global and the local free-run thresholds are exceeded. Note that previously realtime threads were treated the same as PF_LESS_THROTTLE threads. This patch does *not* change the behvaiour for real-time threads, so it is now different from the behaviour of nfsd and loop tasks. I don't know what is wanted for realtime. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Chuck Lever [nfsd] Cc: Christoph Hellwig Cc: Michal Hocko Cc: Trond Myklebust Link: http://lkml.kernel.org/r/87ftbf7gs3.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..180a2fa33f7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) -- cgit v1.2.3 From 515e5b6d90d410a3b0b433853c367936830a45a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:32 -0700 Subject: dma-mapping: use vmap insted of reimplementing it Replace the open coded instance of vmap with the actual function. In the non-contiguous (IOMMU) case this requires an extra find_vm_area, but given that this isn't a fast path function that is a small price to pay. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/dma/remap.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* -- cgit v1.2.3 From 88dca4ca5a93d2c09e5bbc6a62fbfc3af83c4fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:40 -0700 Subject: mm: remove the pgprot argument to __vmalloc The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley [hyperv] Acked-by: Gao Xiang [erofs] Acked-by: Peter Zijlstra (Intel) Acked-by: Wei Liu Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-22-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/core.c | 6 +++--- kernel/groups.c | 2 +- kernel/module.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..086618a0058f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2946,8 +2946,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; -- cgit v1.2.3 From 2b9059489c839e67ca9254913325e18cea11a980 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:53 -0700 Subject: mm: remove __vmalloc_node_flags_caller Just use __vmalloc_node instead which gets and extra argument. To be able to to use __vmalloc_node in all caller make it available outside of vmalloc and implement it in nommu.c. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-25-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..9c1cf7a87fb3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -299,9 +299,8 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags); } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, + numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) -- cgit v1.2.3 From 041de93ff86fc500aa73e5360039c95f4d31b95f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:02 -0700 Subject: mm: remove vmalloc_user_node_flags Open code it in __bpf_map_area_alloc, which is the only caller. Also clean up __bpf_map_area_alloc to have a single vmalloc call with slightly different flags instead of the current two different calls. For this to compile for the nommu case add a __vmalloc_node_range stub to nommu.c. [akpm@linux-foundation.org: fix nommu.c build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-27-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9c1cf7a87fb3..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. 
*/ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, - numa_node, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) -- cgit v1.2.3 From 73f693c3a705756032c2863bfb37570276902d7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:36 -0700 Subject: mm: remove vmalloc_sync_(un)mappings() These functions are not needed anymore because the vmalloc and ioremap mappings are now synchronized when they are created or torn down. Remove all callers and function definitions. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-7-joro@8bytes.org Signed-off-by: Linus Torvalds --- kernel/notifier.c | 1 - kernel/trace/trace.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } -- cgit v1.2.3 From b7e4b65f3fe92abbf4a1f57987a54c820969aebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2020 00:16:31 -0400 Subject: bpf: make bpf_check_uarg_tail_zero() use check_zeroed_user() ... rather than open-coding it, and badly, at that. 
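For reference, a minimal sketch of the calling convention the new code relies on, as it can be read off the diff below (check_zeroed_user() itself is declared in the uaccess header and is not shown here; the wrapper function is hypothetical):

#include <linux/errno.h>
#include <linux/uaccess.h>

/*
 * Sketch only: check_zeroed_user(from, size) returns a negative errno on a
 * fault, 0 if a non-zero byte was found, and a positive value if the whole
 * user range was zero.
 */
static int example_tail_must_be_zero(const void __user *uaddr,
				     size_t expected_size, size_t actual_size)
{
	int res = check_zeroed_user(uaddr + expected_size,
				    actual_size - expected_size);

	if (res < 0)			/* e.g. -EFAULT */
		return res;
	return res ? 0 : -E2BIG;	/* reject non-zero tail bytes */
}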
Acked-by: Alexei Starovoitov Signed-off-by: Al Viro --- kernel/bpf/syscall.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64783da34202..41ba746ecbc2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -67,32 +67,19 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, size_t actual_size) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - int err; + unsigned char __user *addr = uaddr + expected_size; + int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(uaddr, actual_size))) - return -EFAULT; - if (actual_size <= expected_size) return 0; - addr = uaddr + expected_size; - end = uaddr + actual_size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - - return 0; + res = check_zeroed_user(addr, actual_size - expected_size); + if (res < 0) + return res; + return res ? 0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { -- cgit v1.2.3 From 305dacf77952e6e62405f916654f38423c78ab2f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:31 -0700 Subject: padata: remove exit routine Patch series "padata: parallelize deferred page init", v3. Deferred struct page init is a bottleneck in kernel boot--the biggest for us and probably others. Optimizing it maximizes availability for large-memory systems and allows spinning up short-lived VMs as needed without having to leave them running. It also benefits bare metal machines hosting VMs that are sensitive to downtime. In projects such as VMM Fast Restart[1], where guest state is preserved across kexec reboot, it helps prevent application and network timeouts in the guests. So, multithread deferred init to take full advantage of system memory bandwidth. Extend padata, a framework that handles many parallel singlethreaded jobs, to handle multithreaded jobs as well by adding support for splitting up the work evenly, specifying a minimum amount of work that's appropriate for one helper thread to do, load balancing between helpers, and coordinating them. More documentation in patches 4 and 8. This series is the first step in a project to address other memory proportional bottlenecks in the kernel such as pmem struct page init, vfio page pinning, hugetlb fallocate, and munmap. Deferred page init doesn't require concurrency limits, resource control, or priority adjustments like these other users will because it happens during boot when the system is otherwise idle and waiting for page init to finish. This has been run on a variety of x86 systems and speeds up kernel boot by 4% to 49%, saving up to 1.6 out of 4 seconds. Patch 6 has more numbers. This patch (of 8): padata_driver_exit() is unnecessary because padata isn't built as a module and doesn't exit. padata's init routine will soon allocate memory, so getting rid of the exit function now avoids pointless code to free it. 
Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-1-daniel.m.jordan@oracle.com Link: http://lkml.kernel.org/r/20200527173608.2885243-2-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index aae789896616..b4c7e3e38a05 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1074,10 +1074,4 @@ static __init int padata_driver_init(void) } module_init(padata_driver_init); -static __exit void padata_driver_exit(void) -{ - cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); - cpuhp_remove_multi_state(hp_online); -} -module_exit(padata_driver_exit); #endif -- cgit v1.2.3 From f1b192b117cd418bacf42a9583d7a01855a18fe5 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:35 -0700 Subject: padata: initialize earlier padata will soon initialize the system's struct pages in parallel, so it needs to be ready by page_alloc_init_late(). The error return from padata_driver_init() triggers an initcall warning, so add a warning to padata_init() to avoid silent failure. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-3-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b4c7e3e38a05..0c5f4f78f2c2 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -31,7 +31,6 @@ #include #include #include -#include #define MAX_OBJ_NUM 1000 @@ -1052,26 +1051,26 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); if (ret < 0) { cpuhp_remove_multi_state(hp_online); - return ret; + goto err; } - return 0; -} -module_init(padata_driver_init); + return; +err: + pr_warn("padata: initialization failed\n"); #endif +} -- cgit v1.2.3 From 4611ce22468895acd61fee9ac1da810d60617d9a Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:39 -0700 Subject: padata: allocate work structures for parallel jobs from a pool padata allocates per-CPU, per-instance work structs for parallel jobs. 
A do_parallel call assigns a job to a sequence number and hashes the number to a CPU, where the job will eventually run using the corresponding work. This approach fit with how padata used to bind a job to each CPU round-robin, makes less sense after commit bfde23ce200e6 ("padata: unbind parallel jobs from specific CPUs") because a work isn't bound to a particular CPU anymore, and isn't needed at all for multithreaded jobs because they don't have sequence numbers. Replace the per-CPU works with a preallocated pool, which allows sharing them between existing padata users and the upcoming multithreaded user. The pool will also facilitate setting NUMA-aware concurrency limits with later users. The pool is sized according to the number of possible CPUs. With this limit, MAX_OBJ_NUM no longer makes sense, so remove it. If the global pool is exhausted, a parallel job is run in the current task instead to throttle a system trying to do too much in parallel. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-4-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 118 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 0c5f4f78f2c2..d779a73d4d92 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -32,7 +32,15 @@ #include #include -#define MAX_OBJ_NUM 1000 +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); static void padata_free_pd(struct parallel_data *pd); @@ -58,30 +66,44 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. 
*/ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data) +{ + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} - padata->parallel(padata); - } +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -105,9 +127,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -135,25 +157,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. 
*/ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -324,8 +346,9 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + hashed_cpu); struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); @@ -416,8 +439,6 @@ static void padata_init_pqueues(struct parallel_data *pd) pqueue = per_cpu_ptr(pd->pqueue, cpu); __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } } @@ -451,7 +472,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -1053,6 +1074,7 @@ EXPORT_SYMBOL(padata_free_shell); void __init padata_init(void) { + unsigned int i, possible_cpus; #ifdef CONFIG_HOTPLUG_CPU int ret; @@ -1064,13 +1086,27 @@ void __init padata_init(void) ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - goto err; - } + if (ret < 0) + goto remove_online_state; +#endif + + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU + cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: + cpuhp_remove_multi_state(hp_online); err: - pr_warn("padata: initialization failed\n"); #endif + pr_warn("padata: initialization failed\n"); } -- cgit v1.2.3 From 004ed42638f4428e70ead59d170f3d17ff761a0f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:43 -0700 Subject: padata: add basic support for multithreaded jobs Sometimes the kernel doesn't take full advantage of system memory bandwidth, leading to a single CPU spending excessive time in initialization paths where the data scales with memory size. Multithreading naturally addresses this problem. Extend padata, a framework that handles many parallel yet singlethreaded jobs, to also handle multithreaded jobs by adding support for splitting up the work evenly, specifying a minimum amount of work that's appropriate for one helper thread to do, load balancing between helpers, and coordinating them. This is inspired by work from Pavel Tatashin and Steve Sistare. 
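As a rough illustration of the new interface (a sketch, not code from the patch: the field names are taken from the diff below, while the struct declaration itself lives in the padata header), a boot-time caller might describe a multithreaded job like this:

#include <linux/padata.h>

/* Hypothetical worker: handles items in the half-open range [start, end). */
static void __init example_thread_fn(unsigned long start, unsigned long end,
				     void *arg)
{
	/* ... per-item initialization work goes here ... */
}

static void __init example_run_job(unsigned long nr_items)
{
	struct padata_mt_job job = {
		.thread_fn	= example_thread_fn,
		.fn_arg		= NULL,
		.start		= 0,
		.size		= nr_items,
		.align		= 1,	/* no special chunk alignment */
		.min_chunk	= 1024,	/* smallest worthwhile unit of work */
		.max_threads	= 4,
	};

	/* Splits [start, start + size) into chunks and runs them on helpers. */
	padata_do_multithreaded(&job);
}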
Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-5-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index d779a73d4d92..29fc5d87a4cd 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include #include #include #include @@ -32,6 +36,8 @@ #include #include +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + struct padata_work { struct work_struct pw_work; struct list_head pw_list; /* padata_free_works linkage */ @@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock); static struct padata_work *padata_works; static LIST_HEAD(padata_free_works); +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; + static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void) } static void padata_work_init(struct padata_work *pw, work_func_t work_fn, - void *data) + void *data, int flags) { - INIT_WORK(&pw->pw_work, work_fn); + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); pw->pw_data = data; } +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; + + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. 
*/ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + static void padata_work_free(struct padata_work *pw) { lockdep_assert_held(&padata_works_lock); list_add(&pw->pw_list, &padata_free_works); } +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); + } + spin_unlock(&padata_works_lock); +} + static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_work *pw = container_of(parallel_work, struct padata_work, @@ -168,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); if (pw) { - padata_work_init(pw, padata_parallel_worker, padata); + padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); } else { /* Maximum works limit exceeded, run in the current task. */ @@ -409,6 +463,98 @@ out: return err; } +static void __init padata_mt_helper(struct work_struct *w) +{ + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; + + spin_lock(&ps->lock); + + while (job->size > 0) { + unsigned long start, size, end; + + start = job->start; + /* So end is chunk size aligned if enough work remains. */ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; + + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. + */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. 
+ */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. */ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); +} + static void __padata_list_init(struct padata_list *pd_list) { INIT_LIST_HEAD(&pd_list->list); -- cgit v1.2.3 From 3fba69a56e16e8dcf182fe6ca77735dd65a898aa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:31 -0700 Subject: mm: memcontrol: drop @compound parameter from memcg charging API The memcg charging API carries a boolean @compound parameter that tells whether the page we're dealing with is a hugepage. mem_cgroup_commit_charge() has another boolean @lrucare that indicates whether the page needs LRU locking or not while charging. The majority of callsites know those parameters at compile time, which results in a lot of naked "false, false" argument lists. This makes for cryptic code and is a breeding ground for subtle mistakes. Thankfully, the huge page state can be inferred from the page itself and doesn't need to be passed along. This is safe because charging completes before the page is published and somebody may split it. Simplify the callsites by removing @compound, and let memcg infer the state by using hpage_nr_pages() unconditionally. That function does PageTransHuge() to identify huge pages, which also helpfully asserts that nobody passes in tail pages by accident. The following patches will introduce a new charging API, best not to carry over unnecessary weight. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Alex Shi Reviewed-by: Joonsoo Kim Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-4-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..40e7488ce467 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -169,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + &memcg); if (err) return err; } @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); @@ -189,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); + mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ -- cgit v1.2.3 From be5d0a74c62d8da43f9526a5b08cdd18e2bbc37a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:57 -0700 Subject: mm: memcontrol: switch to native NR_ANON_MAPPED counter Memcg maintains a private MEMCG_RSS counter. This divergence from the generic VM accounting means unnecessary code overhead, and creates a dependency for memcg that page->mapping is set up at the time of charging, so that page types can be told apart. Convert the generic accounting sites to mod_lruvec_page_state and friends to maintain the per-cgroup vmstat counter of NR_ANON_MAPPED. We use lock_page_memcg() to stabilize page->mem_cgroup during rmap changes, the same way we do for NR_FILE_MAPPED. With the previous patch removing MEMCG_CACHE and the private NR_SHMEM counter, this patch finally eliminates the need to have page->mapping set up at charge time. However, we need to have page->mem_cgroup set up by the time rmap runs and does the accounting, so switch the commit and the rmap callbacks around. v2: fix temporary accounting bug by switching rmap<->commit (Joonsoo) Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Hugh Dickins Cc: Joonsoo Kim Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-11-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40e7488ce467..89ef81b65bcb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,8 +188,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ -- cgit v1.2.3 From 9d82c69438d0dff8809061edbcce43a5a4bcf09f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:04 -0700 Subject: mm: memcontrol: convert anon and file-thp to new mem_cgroup_charge() API With the page->mapping requirement gone from memcg, we can charge anon and file-thp pages in one single step, right after they're allocated. This removes two out of three API calls - especially the tricky commit step that needed to happen at just the right time between when the page is "set up" and when it's "published" - somewhat vague and fluid concepts that varied by page type. All we need is a freshly allocated page and a memcg context to charge. v2: prevent double charges on pre-allocated hugepages in khugepaged [hannes@cmpxchg.org: Fix crash - *hpage could be ERR_PTR instead of NULL] Link: http://lkml.kernel.org/r/20200512215813.GA487759@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Cc: Qian Cai Link: http://lkml.kernel.org/r/20200508183105.225460-13-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 89ef81b65bcb..4253c153e985 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, + false); if (err) return err; } @@ -179,16 +178,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); - mem_cgroup_commit_charge(new_page, memcg, false); page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else -- cgit v1.2.3 From d9eb1ea2bf8734afd8ec7d995270437a7242f82b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:24 -0700 Subject: mm: memcontrol: delete unused lrucare handling Swapin faults were the last event to charge pages after they had already been put on the LRU list. Now that we charge directly on swapin, the lrucare portion of the charge code is unused. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Cc: Shakeel Butt Link: http://lkml.kernel.org/r/20200508183105.225460-19-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4253c153e985..eddc8db96027 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,8 +167,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, - false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } -- cgit v1.2.3 From c843966c556d7370bb32e7319a6d164cb8c70ae2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:37 -0700 Subject: mm: allow swappiness that prefers reclaiming anon over the file workingset With the advent of fast random IO devices (SSDs, PMEM) and in-memory swap devices such as zswap, it's possible for swap to be much faster than filesystems, and for swapping to be preferable over thrashing filesystem caches. Allow setting swappiness - which defines the rough relative IO cost of cache misses between page cache and swap-backed pages - to reflect such situations by making the swap-preferred range configurable. 
Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-4-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..7f15d292e44c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -131,6 +131,7 @@ static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; +static int two_hundred = 200; static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; @@ -1391,7 +1392,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = &two_hundred, }, #ifdef CONFIG_HUGETLB_PAGE { -- cgit v1.2.3 From 333ed74689b8fca097574124fef7fa0e3d7f79d4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Jun 2020 12:16:37 +0100 Subject: scs: Report SCS usage in bytes rather than number of entries Fix the SCS debug usage check so that we report the number of bytes used, rather than the number of entries. Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging") Reported-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Will Deacon --- kernel/scs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 222a7a9ad543..5d4d9bbdec36 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -74,7 +74,7 @@ static void scs_check_usage(struct task_struct *tsk) for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; - used++; + used += sizeof(*p); } while (used > curr) { -- cgit v1.2.3 From e7ed83d6fa1a00d0f2ad0327e73d3ea9e7ea8de1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jun 2020 11:54:36 +0300 Subject: bpf: Fix an error code in check_btf_func() This code returns success if the "info_aux" allocation fails but it should return -ENOMEM. Fixes: 8c1b6e69dcc1 ("bpf: Compare BTF types of functions arguments with actual types") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200604085436.GA943001@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c7bbaac81ef..34cde841ab68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7552,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env, const struct btf *btf; void __user *urecord; u32 prev_offset = 0; - int ret = 0; + int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) -- cgit v1.2.3 From 3c61df3885e91f8737bbbbaba79b908da0e1919f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:48 -0700 Subject: kcov: cleanup debug messages Patch series "kcov: collect coverage from usb soft interrupts", v4. This patchset extends kcov to allow collecting coverage from soft interrupts and then uses the new functionality to collect coverage from USB code. This has allowed to find at least one new HID bug [1], which was recently fixed by Alan [2]. [1] https://syzkaller.appspot.com/bug?extid=09ef48aa58261464b621 [2] https://patchwork.kernel.org/patch/11283319/ Any subsystem that uses softirqs (e.g. timers) can make use of this in the future. 
Looking at the recent syzbot reports, an obvious candidate is the networking subsystem [3, 4, 5 and many more]. [3] https://syzkaller.appspot.com/bug?extid=522ab502c69badc66ab7 [4] https://syzkaller.appspot.com/bug?extid=57f89d05946c53dbbb31 [5] https://syzkaller.appspot.com/bug?extid=df358e65d9c1b9d3f5f4 This patch (of 7): The previous commit left a lot of excessive debug messages; clean them up. Link: http://lkml.kernel.org/r/cover.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/ab5e2885ce674ba6e04368551e51eeb6a2c11baf.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Alexander Potapenko Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/4a497134b2cf7a9d306d28e3dd2746f5446d1605.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..e6bb2b50569f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -98,6 +98,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +120,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +134,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -366,7 +363,6 @@ static void kcov_remote_reset(struct kcov *kcov) hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } @@ -553,7 +549,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +567,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -598,7 +592,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. 
*/ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +603,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -629,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->remote_size = remote_arg->area_size; spin_lock(&kcov_remote_lock); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { spin_unlock(&kcov_remote_lock); @@ -644,8 +635,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { spin_unlock(&kcov_remote_lock); @@ -782,7 +771,6 @@ void kcov_remote_start(u64 handle) spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); spin_unlock(&kcov_remote_lock); return; } @@ -810,8 +798,6 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); - kcov_start(t, size, area, mode, sequence); } @@ -881,10 +867,8 @@ void kcov_remote_stop(void) unsigned int size = t->kcov_size; int sequence = t->kcov_sequence; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!kcov) return; - } kcov_stop(t); t->kcov = NULL; @@ -894,8 +878,6 @@ void kcov_remote_stop(void) * KCOV_DISABLE could have been called between kcov_remote_start() * and kcov_remote_stop(), hence the check. */ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); -- cgit v1.2.3 From 67b3d3cca385507c4c8b6ad97b823415e038e3c8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:51 -0700 Subject: kcov: fix potential use-after-free in kcov_remote_start If vmalloc() fails in kcov_remote_start() we'll access remote->kcov without holding kcov_remote_lock, so remote might potentially be freed at that point. Cache kcov pointer in a local variable. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/9d9134359725a965627b7e8f2652069f86f1d1fa.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/de0d3d30ff90776a2a509cc34c7c1c7521bda125.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index e6bb2b50569f..14e7208c5291 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -748,6 +748,7 @@ static const struct file_operations kcov_fops = { void kcov_remote_start(u64 handle) { struct kcov_remote *remote; + struct kcov *kcov; void *area; struct task_struct *t; unsigned int size; @@ -774,16 +775,17 @@ void kcov_remote_start(u64 handle) spin_unlock(&kcov_remote_lock); return; } + kcov = remote->kcov; /* Put in kcov_remote_stop(). 
*/ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); + t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; + size = kcov->remote_size; + mode = kcov->mode; + sequence = kcov->sequence; area = kcov_remote_area_get(size); spin_unlock(&kcov_remote_lock); @@ -791,7 +793,7 @@ void kcov_remote_start(u64 handle) area = vmalloc(size * sizeof(unsigned long)); if (!area) { t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } -- cgit v1.2.3 From 76484b1c77242b737f8fd001d6e00af7518221f3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:55 -0700 Subject: kcov: move t->kcov assignments into kcov_start/stop Every time kcov_start/stop() is called, t->kcov is also assigned, so move the assignment into the functions. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/6644839d3567df61ade3c4b246a46cacbe4f9e11.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/82625ef3ff878f0b585763cc31d09d9b08ca37d6.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 14e7208c5291..96dbc198d166 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -309,10 +309,12 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; @@ -326,6 +328,7 @@ static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -333,7 +336,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -584,9 +586,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); @@ -778,7 +779,6 @@ void kcov_remote_start(u64 handle) kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); - t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). @@ -792,7 +792,6 @@ void kcov_remote_start(u64 handle) if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; kcov_put(kcov); return; } @@ -800,7 +799,7 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. 
*/ *(u64 *)area = 0; - kcov_start(t, size, area, mode, sequence); + kcov_start(t, kcov, size, area, mode, sequence); } EXPORT_SYMBOL(kcov_remote_start); @@ -873,7 +872,6 @@ void kcov_remote_stop(void) return; kcov_stop(t); - t->kcov = NULL; spin_lock(&kcov->lock); /* -- cgit v1.2.3 From eeb91f9a2e3e9766ae9fd1117bd19d87538f21bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:58 -0700 Subject: kcov: move t->kcov_sequence assignment Move t->kcov_sequence assignment before assigning t->kcov_mode for consistency. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/5889efe35e0b300e69dba97216b1288d9c2428a8.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/f0283c676bab3335cb48bfe12d375a3da4719f59.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 96dbc198d166..7cd05bd1fada 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -318,10 +318,10 @@ static void kcov_start(struct task_struct *t, struct kcov *kcov, /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) -- cgit v1.2.3 From 5fe7042dc0a2e80b4633df20dcd06b93e76e3c31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:01 -0700 Subject: kcov: use t->kcov_mode as enabled indicator Currently kcov_remote_start() and kcov_remote_stop() check t->kcov to find out whether the coverage is already being collected by the current task. Use t->kcov_mode for that instead. This doesn't change the overall behavior in any way, but serves as a preparation for the following softirq coverage collection support patch. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/f70377945d1d8e6e4916cbce871a12303d6186b4.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/ee1a1dec43059da5d7664c85c1addc89c4cd58de.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 7cd05bd1fada..93b28ad2da28 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -746,26 +746,33 @@ static const struct file_operations kcov_fops = { * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. 
*/ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; if (WARN_ON(!in_task())) return; - t = current; + /* * Check that kcov_remote_start is not called twice * nor called by user tasks (with enabled kcov). */ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(kcov_mode_enabled(mode))) return; kcov_debug("handle = %llx\n", handle); @@ -863,13 +870,20 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; - if (!kcov) + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) return; + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; kcov_stop(t); -- cgit v1.2.3 From 5ff3b30ab57da82d8db4f14662a2858cabfbc2c0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:04 -0700 Subject: kcov: collect coverage from interrupts This change extends kcov remote coverage support to allow collecting coverage from soft interrupts in addition to kernel background threads. To collect coverage from code that is executed in softirq context, a part of that code has to be annotated with kcov_remote_start/stop() in a similar way as how it is done for global kernel background threads. Then the handle used for the annotations has to be passed to the KCOV_REMOTE_ENABLE ioctl. Internally this patch adjusts the __sanitizer_cov_trace_pc() compiler inserted callback to not bail out when called from softirq context. kcov_remote_start/stop() are updated to save/restore the current per task kcov state in a per-cpu area (in case the softirq came when the kernel was already collecting coverage in task context). Coverage from softirqs is collected into pre-allocated per-cpu areas, whose size is controlled by the new CONFIG_KCOV_IRQ_AREA_SIZE. 
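To make the annotation step concrete, a minimal sketch of what such an annotation might look like in softirq context (the surrounding function and the handle value are hypothetical; only the kcov_remote_start(u64)/kcov_remote_stop() calls, visible in the diffs below, are taken from the code):

#include <linux/kcov.h>
#include <linux/types.h>

/*
 * Hypothetical softirq-context section whose coverage should be attributed
 * to a remote handle that userspace registered via KCOV_REMOTE_ENABLE.
 */
static void example_softirq_section(u64 kcov_handle)
{
	kcov_remote_start(kcov_handle);
	/* ... process the work item; coverage lands in the remote area ... */
	kcov_remote_stop();
}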
[andreyknvl@google.com: turn current->kcov_softirq into unsigned int to fix objtool warning] Link: http://lkml.kernel.org/r/841c778aa3849c5cb8c3761f56b87ce653a88671.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Greg Kroah-Hartman Cc: Marco Elver Link: http://lkml.kernel.org/r/469bd385c431d050bc38a593296eff4baae50666.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 155 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 93b28ad2da28..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -145,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -360,8 +373,9 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; @@ -370,7 +384,7 @@ static void kcov_remote_reset(struct kcov *kcov) } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -399,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -428,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. 
*/ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -444,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -459,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -468,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -548,6 +564,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: @@ -620,17 +637,19 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } @@ -638,20 +657,22 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (remote_arg->common_handle) { if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. 
*/ kcov_get(kcov); return 0; @@ -667,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -687,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -706,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -718,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -739,7 +761,7 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. 
* - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc() @@ -752,6 +774,39 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { struct task_struct *t = current; @@ -761,28 +816,42 @@ void kcov_remote_start(u64 handle) void *area; unsigned int size; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ mode = READ_ONCE(t->kcov_mode); - if (WARN_ON(kcov_mode_enabled(mode))) + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); @@ -790,12 +859,18 @@ void kcov_remote_start(u64 handle) * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = kcov->remote_size; mode = kcov->mode; sequence = kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { @@ -803,11 +878,20 @@ void kcov_remote_start(u64 handle) return; } } + + local_irq_save(flags); + /* Reset coverage size. 
*/ *(u64 *)area = 0; + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } kcov_start(t, kcov, size, area, mode, sequence); + local_irq_restore(flags); + } EXPORT_SYMBOL(kcov_remote_start); @@ -875,31 +959,53 @@ void kcov_remote_stop(void) void *area; unsigned int size; int sequence; + unsigned long flags; + + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); mode = READ_ONCE(t->kcov_mode); barrier(); - if (!kcov_mode_enabled(mode)) + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); return; + } kcov = t->kcov; area = t->kcov_area; size = t->kcov_size; sequence = t->kcov_sequence; + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } + kcov_stop(t); + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + local_irq_restore(flags); + + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -913,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The -- cgit v1.2.3 From 3fe4f4991a2a818277445bd5b8b289305b7dd15d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:44 -0700 Subject: kexec_file: don't place kexec images on IORESOURCE_MEM_DRIVER_MANAGED Memory flagged with IORESOURCE_MEM_DRIVER_MANAGED is special - it won't be part of the initial memmap of the kexec kernel and not all memory might be accessible. Don't place any kexec images onto it. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-4-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; -- cgit v1.2.3 From de83dbd97f173650a602c5e356025b732173ecc4 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun 2020 16:49:58 -0700 Subject: user.c: make uidhash_table static Fix the following sparse warning: kernel/user.c:85:19: warning: symbol 'uidhash_table' was not declared. 
Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Cc: David Howells Cc: Greg Kroah-Hartman Cc: Rasmus Villemoes Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200413082146.22737-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is -- cgit v1.2.3 From eac2cece45074e372f78a459c7bb2d7207b72736 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 4 Jun 2020 16:51:11 -0700 Subject: kernel/kprobes.c: convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Greg KH Cc: Ingo Molnar Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-4-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..50cd84f53df0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2684,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - 
&debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } -- cgit v1.2.3 From 54e200ab40fc14c863bcc80a51e20b7906608fce Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 4 Jun 2020 16:51:27 -0700 Subject: kernel/relay.c: handle alloc_percpu returning NULL in relay_open alloc_percpu() may return NULL, which means chan->buf may be set to NULL. In that case, when we do *per_cpu_ptr(chan->buf, ...), we dereference an invalid pointer: BUG: Unable to handle kernel data access at 0x7dae0000 Faulting instruction address: 0xc0000000003f3fec ... NIP relay_open+0x29c/0x600 LR relay_open+0x270/0x600 Call Trace: relay_open+0x264/0x600 (unreliable) __blk_trace_setup+0x254/0x600 blk_trace_setup+0x68/0xa0 sg_ioctl+0x7bc/0x2e80 do_vfs_ioctl+0x13c/0x1300 ksys_ioctl+0x94/0x130 sys_ioctl+0x48/0xb0 system_call+0x5c/0x68 Check if alloc_percpu returns NULL. This was found by syzkaller both on x86 and powerpc, and the reproducer it found on powerpc is capable of hitting the issue as an unprivileged user. Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: syzbot+1e925b4b836afe85a1c6@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+587b2421926808309d21@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+58320b7171734bf79d26@syzkaller.appspotmail.com Reported-by: syzbot+d6074fb08bdb2e010520@syzkaller.appspotmail.com Signed-off-by: Daniel Axtens Signed-off-by: Andrew Morton Reviewed-by: Michael Ellerman Reviewed-by: Andrew Donnellan Acked-by: David Rientjes Cc: Akash Goel Cc: Andrew Donnellan Cc: Guenter Roeck Cc: Salvatore Bonaccorso Cc: [4.10+] Link: http://lkml.kernel.org/r/20191219121256.26480-1-dja@axtens.net Signed-off-by: Linus Torvalds --- kernel/relay.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..dc82705e1cff 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; -- cgit v1.2.3 From 341a7213e5c1ce274cc0f02270054905800ea660 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Thu, 4 Jun 2020 16:51:30 -0700 Subject: kernel/relay.c: fix read_pos error when multiple readers When reading, read_pos should start with bytes_consumed, not file->f_pos. Because when there is more than one reader, the read_pos corresponding to file->f_pos may have been consumed, which will cause the data that has been consumed to be read and the bytes_consumed update error. Signed-off-by: Pengcheng Yang Signed-off-by: Andrew Morton Reviewed-by: Jens Axboe Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Al Viro e Link: http://lkml.kernel.org/r/1579691175-28949-1-git-send-email-yangpc@wangsu.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index dc82705e1cff..204867220f8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -996,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? 
*/ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1064,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1136,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; -- cgit v1.2.3 From d24de76af836260a99ca2ba281a937bd5bc55591 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jun 2020 07:14:43 +0200 Subject: block: remove the error argument to the block_bio_complete tracepoint The status can be trivially derived from the bio itself. That also avoid callers like NVMe to incorrectly pass a blk_status_t instead of the errno, and the overhead of translating the blk_status_t to the errno in the I/O completion fast path when no tracing is enabled. Fixes: 35fe0d12c8a3 ("nvme: trace bio completion") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea47f2084087..1e5499414cdf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -885,10 +885,10 @@ static void blk_add_trace_bio_bounce(void *ignore, } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) + struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, + blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, -- cgit v1.2.3 From 48bc3cd3e07a1486f45d9971c75d6090976c3b1b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:28 -0700 Subject: blktrace: use errno instead of bi_status In blk_add_trace_spliti() blk_add_trace_bio_remap() use blk_status_to_errno() to pass the error instead of pasing the bi_status. This fixes the sparse warning. 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1e5499414cdf..e2013bd0e2a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -995,8 +995,10 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu, blk_trace_bio_get_cgid(q, bio)); + BLK_TA_SPLIT, + blk_status_to_errno(bio->bi_status), + sizeof(rpdu), &rpdu, + blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } @@ -1033,7 +1035,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } -- cgit v1.2.3 From 71df3fd82e7cccec7b749a8607a4662d9f7febdd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:29 -0700 Subject: blktrace: fix endianness in get_pdu_int() In function get_pdu_len() replace variable type from __u64 to __be64. This fixes sparse warning. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e2013bd0e2a6..f857b4684beb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1256,7 +1256,7 @@ static inline __u16 t_error(const struct trace_entry *ent) static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent, has_cg); + const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } -- cgit v1.2.3 From 5aec598c456fe3c1b71a1202cbb42bdc2a643277 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:30 -0700 Subject: blktrace: fix endianness for blk_log_remap() The function blk_log_remap() can be simplified by removing the call to get_pdu_remap() that copies the values into extra variable to print the data, which also fixes the endiannness warning reported by sparse. 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f857b4684beb..5773f0ba7e76 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1260,17 +1260,6 @@ static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) return be64_to_cpu(*val); } -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r, bool has_cg) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - __u64 sector_from = __r->sector_from; - - r->device_from = be32_to_cpu(__r->device_from); - r->device_to = be32_to_cpu(__r->device_to); - r->sector_from = be64_to_cpu(sector_from); -} - typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); @@ -1410,13 +1399,13 @@ static void blk_log_with_error(struct trace_seq *s, static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { - struct blk_io_trace_remap r = { .device_from = 0, }; + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); + MAJOR(be32_to_cpu(__r->device_from)), + MINOR(be32_to_cpu(__r->device_from)), + be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) -- cgit v1.2.3 From afd8d7c7f93681370f6fc2ec62f2705710eee62d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 31 May 2020 23:00:59 +0200 Subject: PM: hibernate: Add __init annotation to swsusp_header_init() 'swsusp_header_init()' is only called via 'core_initcall'. It can be marked as __init to save a few bytes of memory. Signed-off-by: Christophe JAILLET [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ca0fcb5ced71..01e2858b5fe3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1590,7 +1590,7 @@ int swsusp_unmark(void) } #endif -static int swsusp_header_init(void) +static int __init swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) -- cgit v1.2.3 From 388d8bdb87e01bcea6d0b2bf797b5f6d7b2401fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Apr 2020 11:40:34 +0100 Subject: tracing: Remove obsolete PREEMPTIRQ_EVENTS kconfig option The PREEMPTIRQ_EVENTS option is unused after commit c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage"). Remove it. Note that this option is hazardous as it stands. It enables TRACE_IRQFLAGS event on non-preempt configurations without the irqsoff tracer enabled. TRACE_IRQFLAGS as it stands incurs significant overhead on each IRQ entry/exit. This is because trace_hardirqs_[on|off] does all the per-cpu manipulations and NMI checks even if tracing is completely disabled for some insane reason. For example, netperf running UDP_STREAM on localhost incurs a 4-6% performance penalty without any tracing if IRQFLAGS is set. It can be put behind a static brach but even the function entry/exit costs a little bit. 
Link: https://lkml.kernel.org/r/20200409104034.GJ3818@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 75407d5dc83a..0c82ac2c5688 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -258,15 +258,6 @@ config TRACE_PREEMPT_TOGGLE Enables hooks which will be called when preemption is first disabled, and last enabled. -config PREEMPTIRQ_EVENTS - bool "Enable trace events for preempt and irq disable/enable" - select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPTION - select GENERIC_TRACER - default n - help - Enable tracing of disable and enable events for preemption and irqs. - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n -- cgit v1.2.3 From 8dfb61dcbaceb19a5ded5e9c9dcf8d05acc32294 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Jun 2020 10:39:55 +0300 Subject: kbuild: add variables for compression tools Allow user to use alternative implementations of compression tools, such as pigz, pbzip2, pxz. For example, multi-threaded tools to speed up the build: $ make GZIP=pigz BZIP2=pbzip2 Variables _GZIP, _BZIP2, _LZOP are used internally because original env vars are reserved by the tools. The use of GZIP in gzip tool is obsolete since 2015. However, alternative implementations (e.g., pigz) still rely on it. BZIP2, BZIP, LZOP vars are not obsolescent. The credit goes to @grsecurity. As a sidenote, for multi-threaded lzma, xz compression one can use: $ export XZ_OPT="--threads=0" Signed-off-by: Denis Efremov Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index e13ca842eb7e..c1510f0ab3ea 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 | find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --numeric-owner --no-recursion \ - -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null + -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From 7ff0d4490ebadae8037a079a70c5cac13385a808 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jun 2020 07:52:37 +0200 Subject: trace: fix an incorrect __user annotation on stack_trace_sysctl No user pointers for sysctls anymore. 
Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Reported-by: build test robot Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace_stack.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c557f42a9397..98bba4764c52 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = { #endif /* CONFIG_DYNAMIC_FTRACE */ int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int was_enabled; int ret; -- cgit v1.2.3 From db38d5c106dfdd7cb7207c83267d82fdf4950b61 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:17 -0700 Subject: kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. 
[akpm@linux-foundation.org: tweak kernel-parameters.txt wording] Suggested-by: Qian Cai Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/panic.c | 34 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 11 ++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..94b5c973770c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -686,3 +693,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..587ed0494f2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); -- cgit v1.2.3 From b467f3ef3c50c4fa8926ca07f7db9a33a645e13a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:31 -0700 Subject: kernel/hung_task convert hung_task_panic boot parameter to sysctl We can now handle sysctl parameters on kernel command line and have infrastructure to convert legacy command line options that duplicate sysctl to become a sysctl alias. This patch converts the hung_task_panic parameter. Note that the sysctl handler is more strict and allows only 0 and 1, while the legacy parameter allowed any non-zero value. But there is little reason anyone would not be using 1. 
Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-4-vbabka@suse.cz Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..b22b5eeab3cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,16 +63,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { -- cgit v1.2.3 From f117955a2255721a6a0e9cecf6cad3a6eb43cbc3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:42 -0700 Subject: kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases After a recent change introduced by Vlastimil's series [0], kernel is able now to handle sysctl parameters on kernel command line; also, the series introduced a simple infrastructure to convert legacy boot parameters (that duplicate sysctls) into sysctl aliases. This patch converts the watchdog parameters softlockup_panic and {hard,soft}lockup_all_cpu_backtrace to use the new alias infrastructure. It fixes the documentation too, since the alias only accepts values 0 or 1, not the full range of integers. We also took the opportunity here to improve the documentation of the previously converted hung_task_panic (see the patch series [0]) and put the alias table in alphabetical order. [0] http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Signed-off-by: Guilherme G. 
Piccoli Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200507214624.21911-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* -- cgit v1.2.3 From 0ec9dc9bcba0a62b0844e54c1caf6b8b0bf6b5b4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:45 -0700 Subject: kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change in that we started to show all CPUs backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a slightly expensive task, specially printing that on console (and specially in many CPU machines, as servers commonly found nowadays). 
So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs backtrace on dmesg/console, which will delay the panic and pollute the log (crash tool would easily grab all CPUs traces with 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs backtraces but not have the system panic when a hung task is detected. The current approach hence is almost as embedding a policy in the kernel, by forcing the CPUs backtraces' dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs backtraces dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analog to the approach taken on soft/hard lockups, that have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there's no warnings left. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b22b5eeab3cb..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -191,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f..34c1278951b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, -- cgit v1.2.3 From 60c958d8df9cfc40b745d6cd583cfbfa7525ead6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. 
Piccoli" Date: Sun, 7 Jun 2020 21:40:48 -0700 Subject: panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. This patch implements a way to collect a bit more debug details when an oops event is reached, by printing all the CPUs backtraces through the usage of NMIs (on architectures that support that). The sysctl added (and documented) here was called "oops_all_cpu_backtrace", and when set will (as the name suggests) dump all CPUs backtraces. Far from ideal, this may be the last option though for users that for some reason cannot panic on oops. Most of times oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses shown in other guests CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Randy Dunlap Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/panic.c | 11 +++++++++++ kernel/sysctl.c | 11 +++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 94b5c973770c..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? 
(1 << TAINT_RANDSTRUCT) : 0; @@ -522,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34c1278951b9..f69d581d39c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, -- cgit v1.2.3 From e77132e75845470065768e2205727e6be52cb7f4 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:51 -0700 Subject: kernel/sysctl.c: ignore out-of-range taint bits introduced via kernel.tainted Users with SYS_ADMIN capability can add arbitrary taint flags to the running kernel by writing to /proc/sys/kernel/tainted or issuing the command 'sysctl -w kernel.tainted=...'. This interface, however, is open for any integer value and this might cause an invalid set of flags being committed to the tainted_mask bitset. This patch introduces a simple way for proc_taint() to ignore any eventual invalid bit coming from the user input before committing those bits to the kernel tainted_mask. Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "Theodore Ts'o" Link: http://lkml.kernel.org/r/20200512223946.888020-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f69d581d39c3..db1ce7af2563 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -880,10 +880,9 @@ static int proc_taint(struct ctl_table *table, int write, * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); - } } return err; -- cgit v1.2.3 From dadbb612f6e50bbf9101c2f5d82690ce9ea4d66b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 7 Jun 2020 21:40:55 -0700 Subject: mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. 
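For illustration, a sketch of the caller conversion described above (not taken from this patch; 'start', 'nr_pages', 'pages', 'addr' and 'page' are placeholder variables):

  /* old: write intent passed as a separate int argument */
  nr = __get_user_pages_fast(start, nr_pages, 1, pages);

  /* new: write intent expressed via FOLL_WRITE in gup_flags */
  nr = get_user_pages_fast_only(start, nr_pages, FOLL_WRITE, pages);

  /* new single-page helper for the common nr_pages == 1 case */
  if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) {
          /* success: 'page' now holds a reference; put_page() when done */
  }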
Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } -- cgit v1.2.3 From 885f7f8e30468705926b3de53d32925bd7a51a03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:22 -0700 Subject: mm: rename flush_icache_user_range to flush_icache_user_page The function currently known as flush_icache_user_range only operates on a single page. Rename it to flush_icache_user_page as we'll need the name flush_icache_user_range for something else soon. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20200515143646.3857579-20-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eddc8db96027..e51ec844c87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1668,7 +1668,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. -- cgit v1.2.3 From 490741ab1beb9c9f6580e30c7cd91b17a7560369 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:52 -0700 Subject: module: move the set_fs hack for flush_icache_range to m68k flush_icache_range generally operates on kernel addresses, but for some reason m68k needed a set_fs override. Move that into the m68k code insted of keeping it in the module loader. 
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Jessica Yu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Link: http://lkml.kernel.org/r/20200515143646.3857579-30-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/module.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ef400c389f49..e8a198588f26 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3344,12 +3344,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3361,8 +3355,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, -- cgit v1.2.3 From 3ee06a6d532f75f20528ff4d2c473cda36c484fe Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 8 Jun 2020 15:22:17 +0200 Subject: dma-pool: fix too large DMA pools on medium memory size systems On systems with at least 32 MiB, but less than 32 GiB of RAM, the DMA memory pools are much larger than intended (e.g. 2 MiB instead of 128 KiB on a 256 MiB system). Fix this by correcting the calculation of the number of GiBs of RAM in the system. Invert the order of the min/max operations, to keep on calculating in pages until the last step, which aids readability. Fixes: 1d659236fb43c4d2 ("dma-pool: scale the default DMA coherent pool size with memory capacity") Signed-off-by: Geert Uytterhoeven Acked-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 35bb51c31fff..8cfa01243ed2 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -175,10 +175,9 @@ static int __init dma_atomic_pool_init(void) * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. */ if (!atomic_pool_size) { - atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * - SZ_128K; - atomic_pool_size = min_t(size_t, atomic_pool_size, - 1 << (PAGE_SHIFT + MAX_ORDER-1)); + unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); + pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES); + atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K); } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); -- cgit v1.2.3 From c7f3d43b629b598a2bb9ec3524e844eae7492e7e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Jun 2020 23:51:15 +0200 Subject: clocksource: Remove obsolete ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE was a transitional config switch which got removed after all architectures got converted to the new storage model. But the removal forgot to remove the #ifdef which guards the vdso_clock_mode sanity check, which effectively disables the sanity check. Remove it now. 
Fixes: f86fd32db706 ("lib/vdso: Cleanup clock mode storage leftovers") Signed-off-by: Thomas Gleixner Tested-by: Miklos Szeredi Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200606221531.845475036@linutronix.de --- kernel/time/clocksource.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7cb09c4cf21c..02441ead3c3b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); -#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", cs->name, cs->vdso_clock_mode); cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } -#endif /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); -- cgit v1.2.3 From 2062a4e8ae9f486847652927aaf88e21ab8d195d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:29:56 -0700 Subject: kallsyms/printk: add loglvl to print_ip_sym() Patch series "Add log level to show_stack()", v3. Add log level argument to show_stack(). Done in three stages: 1. Introducing show_stack_loglvl() for every architecture 2. Migrating old users with an explicit log level 3. Renaming show_stack_loglvl() into show_stack() Justification: - It's a design mistake to move a business-logic decision into platform realization detail. - I have currently two patches sets that would benefit from this work: Removing console_loglevel jumps in sysrq driver [1] Hung task warning before panic [2] - suggested by Tetsuo (but he probably didn't realise what it would involve). - While doing (1), (2) the backtraces were adjusted to headers and other messages for each situation - so there won't be a situation when the backtrace is printed, but the headers are missing because they have lesser log level (or the reverse). - As the result in (2) plays with console_loglevel for kdb are removed. The least important for upstream, but maybe still worth to note that every company I've worked in so far had an off-list patch to print backtrace with the needed log level (but only for the architecture they cared about). If you have other ideas how you will benefit from show_stack() with a log level - please, reply to this cover letter. See also discussion on v1: https://lore.kernel.org/linux-riscv/20191106083538.z5nlpuf64cigxigh@pathway.suse.cz/ This patch (of 50): print_ip_sym() needs to have a log level parameter to comply with other parts being printed. Otherwise, half of the expected backtrace would be printed and other may be missing with some logging level. The following callee(s) are using now the adjusted log level: - microblaze/unwind: the same level as headers & userspace unwind. Note that pr_debug()'s there are for debugging the unwinder itself. - nds32/traps: symbol addresses are printed with the same log level as backtrace headers. - lockdep: ip for locking issues is printed with the same log level as other part of the warning. - sched: ip where preemption was disabled is printed as error like the rest part of the message. - ftrace: bug reports are now consistent in the log level being used. 
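
For reference, the reworked helper takes the log level as its first argument and prefixes it onto the single line it prints; roughly (a sketch of the kallsyms inline, exact format string assumed):

	static inline void print_ip_sym(const char *loglvl, unsigned long ip)
	{
		printk("%s[<%px>] %pS\n", loglvl, (void *)ip, (void *)ip);
	}

	/* callers pick the level matching their surrounding output */
	print_ip_sym(KERN_WARNING, ip);
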
Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Steven Rostedt (VMware) Cc: Albert Ou Cc: Ben Segall Cc: Dietmar Eggemann Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Hogan Cc: Juri Lelli Cc: Mel Gorman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vincent Chen Cc: Vincent Guittot Cc: Will Deacon Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Dmitry Safonov Cc: Jiri Slaby Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Aurelien Jacquiot Cc: Mark Salter Cc: Guo Ren Cc: Yoshinori Sato Cc: Brian Cain Cc: Fenghua Yu Cc: Tony Luck Cc: Geert Uytterhoeven Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Rich Felker Cc: "David S. Miller" Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Chris Zankel Cc: Max Filippov Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Rafael J. Wysocki" Cc: Daniel Thompson Cc: Douglas Anderson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-2-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 4 ++-- kernel/sched/core.c | 6 ++---- kernel/trace/ftrace.c | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4c057dd8e93b..38cce34d03dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce..c06da3c3e317 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5765aeea698..7d0ebd104706 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2020,12 +2020,12 @@ void 
ftrace_bug(int failed, struct dyn_ftrace *rec) case -EFAULT: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2036,12 +2036,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EPERM: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { -- cgit v1.2.3 From 77819daf247aad16beaeb537ae77d1d6d0697ca2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:19 -0700 Subject: kdb: don't play with console_loglevel Print the stack trace with KERN_EMERG - it should be always visible. Playing with console_loglevel is a bad idea as there may be more messages printed than wanted. Also the stack trace might be not printed at all if printk() was deferred and console_loglevel was raised back before the trace got flushed. Unfortunately, after rebasing on commit 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master"), kdb_show_stack() uses now kdb_dump_stack_on_cpu(), which for now won't be converted as it uses dump_stack() instead of show_stack(). Convert for now the branch that uses show_stack() and remove console_loglevel exercise from that case. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Reviewed-by: Douglas Anderson Acked-by: Daniel Thompson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-48-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bt.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 3de0cc780c16..43f5dcd2b9ac 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,17 +21,18 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { - int old_lvl = console_loglevel; - - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - if (!addr && kdb_task_has_cpu(p)) + if (!addr && kdb_task_has_cpu(p)) { + int old_lvl = console_loglevel; + + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_dump_stack_on_cpu(kdb_process_cpu(p)); - else - show_stack(p, addr); + console_loglevel = old_lvl; + } else { + show_stack_loglvl(p, addr, KERN_EMERG); + } - console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit v1.2.3 From 8ba09b1dc131ff9bb530967c593e298c600f72c0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:23 -0700 Subject: sched: print stack trace with KERN_INFO Aligning with other messages printed in sched_show_task() - use KERN_INFO to print the backtrace. 
Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20200418201944.482088-49-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06da3c3e317..c68a6e7b306f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + show_stack_loglvl(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); -- cgit v1.2.3 From fe1993a001094a596576334c56e7a10e4cd69e6c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:26 -0700 Subject: kernel: use show_stack_loglvl() Align the last users of show_stack() with KERN_DEFAULT, the same level as the surrounding headers/messages. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-50-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/locking/rtmutex-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458..5e63d6e8a223 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack_loglvl(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); -- cgit v1.2.3 From 9cb8f069deeed708bf19486d5893e297dc467ae0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:29 -0700 Subject: kernel: rename show_stack_loglvl() => show_stack() Now that the last users of show_stack() have been converted to use an explicit log level, show_stack_loglvl() can drop its redundant suffix and once again become the well-known show_stack().
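
The end state of the rename, sketched from the converted call sites in the diff below (the prototype itself lives outside the 'kernel' directory and is assumed here):

	void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl);

	/* e.g. from sched_show_task(), matching its other KERN_INFO output */
	show_stack(p, NULL, KERN_INFO);
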
Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/locking/rtmutex-debug.c | 2 +- kernel/sched/core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 43f5dcd2b9ac..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -30,7 +30,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) kdb_dump_stack_on_cpu(kdb_process_cpu(p)); console_loglevel = old_lvl; } else { - show_stack_loglvl(p, addr, KERN_EMERG); + show_stack(p, addr, KERN_EMERG); } kdb_trap_printk--; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 5e63d6e8a223..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack_loglvl(task, NULL, KERN_DEFAULT); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c68a6e7b306f..8f360326861e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack_loglvl(p, NULL, KERN_INFO); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); -- cgit v1.2.3 From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c300253a7b8e..1aecef938822 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -66,7 +66,6 @@ #include #include -#include #include static void __unhash_process(struct task_struct *p, bool group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index cefe8745c46e..3603e14474cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,7 +96,6 @@ #include #include -#include #include #include #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 659800157b17..881128b9351e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -34,7 +34,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From ca5999fde0a1761665a38e4c9a72dbcd7d190a81 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:38 -0700 Subject: mm: introduce include/linux/pgtable.h The include/linux/pgtable.h is going to be the home of generic page table manipulation functions. Start with moving asm-generic/pgtable.h to include/linux/pgtable.h and make the latter include asm/pgtable.h. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..c3ae2adaeccd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From d8ed45c5dcd455fc5848d47f86883a1b872ac0d0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:25 -0700 Subject: mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . 
@@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 ++-- kernel/bpf/stackmap.c | 4 ++-- kernel/events/core.c | 4 ++-- kernel/events/uprobes.c | 16 ++++++++-------- kernel/exit.c | 8 ++++---- kernel/fork.c | 12 ++++++------ kernel/futex.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sys.c | 18 +++++++++--------- kernel/trace/trace_output.c | 4 ++-- 10 files changed, 39 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 11ff4a596d6b..c530568dd51c 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = current->mm->mmap; while (vma) { vsize += vma->vm_end - vma->vm_start; vma = vma->vm_next; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 7b8381ce40a0..a13b7e28eaf8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -317,7 +317,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. 
*/ if (!user || !current || !current->mm || irq_work_busy || - down_read_trylock(¤t->mm->mmap_sem) == 0) { + mmap_read_trylock(current->mm) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,7 +342,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } else { work->sem = ¤t->mm->mmap_sem; irq_work_queue(&work->irq_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 63d66bbebbd5..2861addad657 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9784,7 +9784,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (!mm) goto restart; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); @@ -9810,7 +9810,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e51ec844c87c..cd82e1ba6b9b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1058,7 +1058,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1080,7 +1080,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); @@ -1235,7 +1235,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long vaddr; loff_t offset; @@ -1252,7 +1252,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1439,7 +1439,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { @@ -1469,7 +1469,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -2039,7 +2039,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { @@ -2057,7 +2057,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return uprobe; } diff --git a/kernel/exit.c b/kernel/exit.c index 1aecef938822..36cbaa43ac80 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -445,12 +445,12 @@ static void exit_mm(void) * will 
increment ->nr_threads for each thread in the * group with ->mm != NULL. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); core_state = mm->core_state; if (core_state) { struct core_thread self; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); self.task = current; self.next = xchg(&core_state->dumper.next, &self); @@ -468,14 +468,14 @@ static void exit_mm(void) freezable_schedule(); } __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); current->mm = NULL; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 3603e14474cd..d61751ba10dc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -492,7 +492,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { + if (mmap_write_lock_killable(oldmm)) { retval = -EINTR; goto fail_uprobe_end; } @@ -617,9 +617,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); @@ -649,9 +649,9 @@ static inline void mm_free_pgd(struct mm_struct *mm) #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - down_write(&oldmm->mmap_sem); + mmap_write_lock(oldmm); RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) @@ -1022,7 +1022,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); + mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; mm_pgtables_bytes_init(mm); diff --git a/kernel/futex.c b/kernel/futex.c index b4b9f960b610..e646661f6282 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -698,10 +698,10 @@ static int fault_in_user_writeable(u32 __user *uaddr) struct mm_struct *mm = current->mm; int ret; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret < 0 ? 
ret : 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35f4cc024dcf..cbcb2f71599b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2770,7 +2770,7 @@ static void task_numa_work(struct callback_head *work) return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); if (!vma) { @@ -2838,7 +2838,7 @@ out: mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code diff --git a/kernel/sys.c b/kernel/sys.c index 891667a49bb7..12805750a66c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (exe_file) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -1855,7 +1855,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_err; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); } @@ -1869,7 +1869,7 @@ exit: fdput(exe); return err; exit_err: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); goto exit; } @@ -2010,7 +2010,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * arg_lock protects concurent updates but we still need mmap_sem for * read to exclude races with sys_brk. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); /* * We don't validate if these members are pointing to @@ -2049,7 +2049,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -2125,7 +2125,7 @@ static int prctl_set_mm(int opt, unsigned long addr, * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr * validation. 
*/ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); @@ -2217,7 +2217,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: spin_unlock(&mm->arg_lock); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return error; } @@ -2442,13 +2442,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - if (down_write_killable(&me->mm->mmap_sem)) + if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); - up_write(&me->mm->mmap_sem); + mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9a121e147102..73976de7f8cc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (mm) { const struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; @@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit v1.2.3 From aaa2cc56c1cd757efec88a4978ffce4cbf884352 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:33 -0700 Subject: mmap locking API: convert nested write lock sites Add API for nested write locks and convert the few call sites doing that. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-7-walken@google.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d61751ba10dc..142b23645d82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); -- cgit v1.2.3 From 0cc55a0213a02b760ade1d4755fdccfbf7d3157e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:37 -0700 Subject: mmap locking API: add mmap_read_trylock_non_owner() Add a couple APIs used by kernel/bpf/stackmap.c only: - mmap_read_trylock_non_owner() - mmap_read_unlock_non_owner() (may be called from a work queue). It's still not ideal that bpf/stackmap subverts the lock ownership in this way. Thanks to Peter Zijlstra for suggesting this API as the least-ugly way of addressing this in the short term. 
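
A sketch of what the two non-owner wrappers amount to in terms of the rwsem primitives already visible in the stackmap diff below (assumed shape of the mmap_lock.h helpers, not quoted from the patch):

	static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
	{
		if (down_read_trylock(&mm->mmap_sem)) {
			/* hand the lock over from lockdep's perspective */
			rwsem_release(&mm->mmap_sem.dep_map, _RET_IP_);
			return true;
		}
		return false;
	}

	static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
	{
		up_read_non_owner(&mm->mmap_sem);
	}
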
Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-8-walken@google.com Signed-off-by: Linus Torvalds --- kernel/bpf/stackmap.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a13b7e28eaf8..599488f25e40 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -33,7 +33,7 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) @@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry) return; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - mmap_read_trylock(current->mm) == 0) { + !mmap_read_trylock_non_owner(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock(current->mm); + mmap_read_unlock_non_owner(current->mm); } else { - work->sem = ¤t->mm->mmap_sem; + work->mm = current->mm; irq_work_queue(&work->irq_work); - /* - * The irq_work will release the mmap_sem with - * up_read_non_owner(). The rwsem_release() is called - * here to release the lock from lockdep's perspective. - */ - rwsem_release(¤t->mm->mmap_sem.dep_map, _RET_IP_); } } -- cgit v1.2.3 From c1e8d7c6a7a682e1405e3e242d32fc377fd196ff Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:54 -0700 Subject: mmap locking API: convert mmap_sem comments Convert comments that reference mmap_sem to reference mmap_lock instead. [akpm@linux-foundation.org: fix up linux-next leftovers] [akpm@linux-foundation.org: s/lockaphore/lock/, per Vlastimil] [akpm@linux-foundation.org: more linux-next fixups, per Michel] Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Vlastimil Babka Reviewed-by: Daniel Jordan Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-13-walken@google.com Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c | 4 ++-- kernel/events/core.c | 6 +++--- kernel/events/uprobes.c | 4 ++-- kernel/exit.c | 2 +- kernel/relay.c | 2 +- kernel/sys.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index c530568dd51c..b0c5b3a9f5af 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -40,7 +40,7 @@ * is one more bug... 10/11/98, AV. * * Oh, fsck... 
Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks * a struct file opened for write. Fixed. 2/6/2000, AV. */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 729d3a5c772e..642415b8c3c9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs) guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold @@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, diff --git a/kernel/events/core.c b/kernel/events/core.c index 2861addad657..856d98c36f56 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1316,7 +1316,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex - * mmap_sem + * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -3080,7 +3080,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly - * registered mapping, called for every new mmap(), with mm::mmap_sem down + * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. @@ -9742,7 +9742,7 @@ static void perf_addr_filters_splice(struct perf_event *event, /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. - * Called with mm::mmap_sem down for reading. + * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cd82e1ba6b9b..4f4993dcf17a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -457,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held for write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -1349,7 +1349,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. 
diff --git a/kernel/exit.c b/kernel/exit.c index 36cbaa43ac80..727150f28103 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -440,7 +440,7 @@ static void exit_mm(void) sync_mm_rss(mm); /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state + * We must hold mmap_lock around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. diff --git a/kernel/relay.c b/kernel/relay.c index 204867220f8a..72fe443ea78f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array) * * Returns 0 if ok, negative on error * - * Caller should already have grabbed mmap_sem. + * Caller should already have grabbed mmap_lock. */ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { diff --git a/kernel/sys.c b/kernel/sys.c index 12805750a66c..fd46865b46ba 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2007,7 +2007,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_sem for + * arg_lock protects concurent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); @@ -2122,7 +2122,7 @@ static int prctl_set_mm(int opt, unsigned long addr, /* * arg_lock protects concurent updates of arg boundaries, we need - * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ mmap_read_lock(mm); -- cgit v1.2.3 From bd88bb5d4007949be7154deae7cef7173c751a95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:14 -0700 Subject: maccess: rename strncpy_from_unsafe_user to strncpy_from_user_nofault This matches the naming of strncpy_from_user, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-7-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3744372a24e2..334db5a299a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -159,7 +159,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto = { BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, const void __user *, unsafe_ptr) { - int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + int ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); @@ -419,7 +419,7 @@ fmt_str: sizeof(buf)); break; case 'u': - strncpy_from_unsafe_user(buf, + strncpy_from_user_nofault(buf, (__force void __user *)unsafe_ptr, sizeof(buf)); break; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..d600f41fda1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1268,7 +1268,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); -- cgit v1.2.3 From c4cb164426aebd635baa53685b0ebf1a127e9803 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:17 -0700 Subject: maccess: rename strncpy_from_unsafe_strict to strncpy_from_kernel_nofault This matches the naming of strncpy_from_user_nofault, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-8-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 334db5a299a0..879ee676bf5a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -240,7 +240,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, * is returned that can be used for bpf_perf_event_output() et al. */ ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) out: memset(dst, 0, size); @@ -415,7 +415,7 @@ fmt_str: break; #endif case 'k': - strncpy_from_unsafe_strict(buf, unsafe_ptr, + strncpy_from_kernel_nofault(buf, unsafe_ptr, sizeof(buf)); break; case 'u': -- cgit v1.2.3 From 02dddb160ec1dccb51e75f3113654a090bc3963a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:20 -0700 Subject: maccess: rename strnlen_unsafe_user to strnlen_user_nofault This matches the naming of strnlen_user, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-9-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d600f41fda1c..4325f9e7fada 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1221,7 +1221,7 @@ fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* -- cgit v1.2.3 From d7b2977b816223a884814eea46fbe38e192cec4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:30 -0700 Subject: bpf: factor out a bpf_trace_copy_string helper Split out a helper to do the fault free access to the string pointer to get it out of a crazy indentation level. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-12-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 879ee676bf5a..60e82c7b8122 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -324,6 +324,28 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } +static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + strncpy_from_unsafe(buf, unsafe_ptr, bufsz); + break; +#endif + case 'k': + strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + break; + case 'u': + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } +} + /* * Only limited trace_printk() conversion specifiers allowed: * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s @@ -406,24 +428,8 @@ fmt_str: break; } - buf[0] = 0; - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, - sizeof(buf)); - break; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, - sizeof(buf)); - break; - case 'u': - strncpy_from_user_nofault(buf, - (__force void __user *)unsafe_ptr, - sizeof(buf)); - break; - } + bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, + sizeof(buf)); goto fmt_next; } -- cgit v1.2.3 From aec6ce59133edc4ac04f7d4e2556fdf047becb62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:33 -0700 Subject: bpf: handle the compat string in bpf_trace_copy_string better User the proper helper for kernel or userspace addresses based on TASK_SIZE instead of the dangerous strncpy_from_unsafe function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-13-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 60e82c7b8122..a2efbdad434b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -334,8 +334,11 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, bufsz); - break; + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } + fallthrough; #endif case 'k': strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); -- cgit v1.2.3 From 19c8d8ac63d39578483db0b82107c172d8edfb07 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Jun 2020 21:34:37 -0700 Subject: bpf:bpf_seq_printf(): handle potentially unsafe format string better User the proper helper for kernel or userspace addresses based on TASK_SIZE instead of the dangerous strncpy_from_unsafe function. Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a2efbdad434b..deefe2823be1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -588,15 +588,17 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] == 's') { + void *unsafe_ptr; + /* try our best to copy */ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { err = -E2BIG; goto out; } - err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + unsafe_ptr = (void *)(long)args[fmt_cnt]; + err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], + unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3 From 8d92db5c04d10381f4db70ed99b1b576f5db18a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:40 -0700 Subject: bpf: rework the compat kernel probe handling Instead of using the dangerous probe_kernel_read and strncpy_from_unsafe helpers, rework the compat probes to check if an address is a kernel or userspace one, and then use the low-level kernel or user probe helper shared by the proper kernel and user probe helpers. This slightly changes behavior as the compat probe on a user address doesn't check the lockdown flags, just as the pure user probes do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-14-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 109 +++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index deefe2823be1..59cb19888891 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -136,17 +136,23 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { - int ret = probe_user_read(dst, unsafe_ptr, size); + int ret; + ret = probe_user_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, @@ -156,17 +162,24 @@ const struct bpf_func_proto bpf_probe_read_user_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_str_common(void *dst, u32 size, + const void __user *unsafe_ptr) { - int ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); + int ret; + ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, @@ -177,25 +190,25 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = { }; static __always_inline int -bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; - ret = compat ? 
probe_kernel_read(dst, unsafe_ptr, size) : - probe_kernel_read_strict(dst, unsafe_ptr, size); + goto fail; + ret = probe_kernel_read_strict(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + return ret; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { @@ -207,50 +220,37 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); -} - -static const struct bpf_func_proto bpf_probe_read_compat_proto = { - .func = bpf_probe_read_compat, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - static __always_inline int -bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; + goto fail; + /* - * The strncpy_from_unsafe_*() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing + * The strncpy_from_kernel_nofault() call will likely not fill the + * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. */ - ret = compat ? 
strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_kernel_nofault(dst, unsafe_ptr, size); + ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + + return 0; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { @@ -262,10 +262,34 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_str_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { @@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) -- cgit v1.2.3 From 9de1fec50b23117f0a19f7609cc837ca72e764a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:44 -0700 Subject: tracing/kprobes: handle mixed kernel/userspace probes better Instead of using the dangerous probe_kernel_read and strncpy_from_unsafe helpers, rework probes to try a user probe based on the address if the architecture has a common address space for kernel and userspace. [svens@linux.ibm.com:use strncpy_from_kernel_nofault() in fetch_store_string()] Link: http://lkml.kernel.org/r/20200606181903.49384-1-svens@linux.ibm.com Signed-off-by: Christoph Hellwig Signed-off-by: Sven Schnelle Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-15-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 72 +++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4325f9e7fada..6a23b45613a2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,6 +1200,15 @@ static const struct file_operations kprobe_profile_ops = { /* Kprobe specific fetch functions */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) @@ -1207,30 +1216,27 @@ fetch_store_strlen(unsigned long addr) int ret, len = 0; u8 c; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + do { - ret = probe_kernel_read(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read_strict(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); return (ret < 0) ? ret : len; } -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); -} - /* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. */ static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base) { + const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; @@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1252,35 +1254,37 @@ fetch_store_string(unsigned long addr, void *dest, void *base) } /* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. 
*/ static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + if (unlikely(!maxlen)) return -ENOMEM; __dest = get_loc_data(dest, base); - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ - return probe_kernel_read(dest, src, size); -} - static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size) { @@ -1289,6 +1293,16 @@ probe_mem_read_user(void *dest, void *src, size_t size) return probe_user_read(dest, uaddr, size); } +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return probe_kernel_read_strict(dest, src, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, -- cgit v1.2.3 From 98a23609b10364a51a1bb3688f8dd1cd1aa94a9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:50 -0700 Subject: maccess: always use strict semantics for probe_kernel_read Except for historical confusion in the kprobes/uprobes and bpf tracers, which has been fixed now, there is no good reason to ever allow user memory accesses from probe_kernel_read. Switch probe_kernel_read to only read from kernel memory. [akpm@linux-foundation.org: update it for "mm, dump_page(): do not crash with invalid mapping pointer"] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-17-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 2 +- kernel/trace/trace_kprobe.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 59cb19888891..e729c9e587a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,7 +196,7 @@ bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) if (unlikely(ret < 0)) goto fail; - ret = probe_kernel_read_strict(dst, unsafe_ptr, size); + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) goto fail; return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6a23b45613a2..ea8d0b094f1b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1222,7 +1222,7 @@ fetch_store_strlen(unsigned long addr) #endif do { - ret = probe_kernel_read_strict(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); @@ -1300,7 +1300,7 @@ probe_mem_read(void *dest, void *src, size_t size) if ((unsigned long)src < TASK_SIZE) return probe_mem_read_user(dest, src, size); #endif - return probe_kernel_read_strict(dest, src, size); + return probe_kernel_read(dest, src, size); } /* Note that we don't verify it, since the code does not come from user space */ -- cgit v1.2.3 From 013b2deba9a6b80ca02f4fafd7dedf875e9b4450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 May 2020 18:47:25 +0200 Subject: uprobes: ensure that uprobe->offset and ->ref_ctr_offset are properly aligned uprobe_write_opcode() must not cross page boundary; prepare_uprobe() relies on arch_uprobe_analyze_insn() which should validate "vaddr" but some architectures (csky, s390, and sparc) don't do this. We can remove the BUG_ON() check in prepare_uprobe() and validate the offset early in __uprobe_register(). The new IS_ALIGNED() check matches the alignment check in arch_prepare_kprobe() on supported architectures, so I think that all insns must be aligned to UPROBE_SWBP_INSN_SIZE. Another problem is __update_ref_ctr() which was wrong from the very beginning, it can read/write outside of kmap'ed page unless "vaddr" is aligned to sizeof(short), __uprobe_register() should check this too. 
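As a minimal sketch of the arithmetic behind this (an editorial illustration, not part of the patch; PAGE_MASK, PAGE_SIZE, IS_ALIGNED() and UPROBE_SWBP_INSN_SIZE are the usual kernel definitions, while "offset" and "ref_ctr_offset" stand for the __uprobe_register() parameters):

	/* Old approach: detect a page-straddling opcode only at prepare time. */
	static bool swbp_crosses_page(loff_t offset)
	{
		return (offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE;
	}

	/* New approach: reject misaligned offsets up front in __uprobe_register().
	 * An offset aligned to UPROBE_SWBP_INSN_SIZE can never straddle a page,
	 * and a ref_ctr_offset aligned to sizeof(short) keeps __update_ref_ctr()
	 * inside the kmap'ed page.
	 */
	static bool uprobe_offsets_ok(loff_t offset, loff_t ref_ctr_offset)
	{
		return IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE) &&
		       IS_ALIGNED(ref_ctr_offset, sizeof(short));
	}
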
Reported-by: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Reviewed-by: Srikar Dronamraju Acked-by: Christian Borntraeger Tested-by: Sven Schnelle Cc: Steven Rostedt Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e51ec844c87c..5dc2f46f38c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -861,10 +861,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* uprobe_write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); @@ -1160,6 +1156,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset, if (offset > i_size_read(inode)) return -EINVAL; + /* + * This ensures that copy_from_page(), copy_to_page() and + * __update_ref_ctr() can't cross page boundary. + */ + if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) + return -EINVAL; + if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) + return -EINVAL; + retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) @@ -2008,6 +2013,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) + return -EINVAL; + pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); -- cgit v1.2.3 From 22d5bd6867364b41576a712755271a7d6161abd6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Jun 2020 14:45:32 +0200 Subject: tracing/probe: Fix bpf_task_fd_query() for kprobes and uprobes Commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") removed the trace_[ku]probe structure from the trace_event_call->data pointer. As bpf_get_[ku]probe_info() were forgotten in that change, fix them now. These functions are currently only used by the bpf_task_fd_query() syscall handler to collect information about a perf event. 
Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20200608124531.819838-1-jean-philippe@linaro.org --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..8eeb95e04bf5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1629,7 +1629,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tk = find_trace_kprobe(pevent, group); else - tk = event->tp_event->data; + tk = trace_kprobe_primary_from_call(event->tp_event); if (!tk) return -EINVAL; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a8e8e9c1c75..fdd47f99b18f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else - tu = event->tp_event->data; + tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; -- cgit v1.2.3 From 26afa0a4eb3fd87757f9de56ec5db5a03b14e120 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 8 Jun 2020 09:17:23 -0600 Subject: bpf: Reset data_meta before running programs attached to devmap entry This is a new context that does not handle metadata at the moment, so mark data_meta invalid. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608151723.9539-1-dsahern@kernel.org --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 854b09beb16b..bfdff2faf5cb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -479,6 +479,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp_set_data_meta_invalid(xdp); xdp->txq = &txq; act = bpf_prog_run_xdp(xdp_prog, xdp); -- cgit v1.2.3 From 248e00ac47d64e153b9c50f45aad73cd61894a73 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 17:22:01 +0100 Subject: bpf: cgroup: Allow multi-attach program to replace itself When using BPF_PROG_ATTACH to attach a program to a cgroup in BPF_F_ALLOW_MULTI mode, it is not possible to replace a program with itself. This is because the check for duplicate programs doesn't take the replacement program into account. Replacing a program with itself might seem weird, but it has some uses: first, it allows resetting the associated cgroup storage. Second, it makes the API consistent with the non-ALLOW_MULTI usage, where it is possible to replace a program with itself. Third, it aligns BPF_PROG_ATTACH with bpf_link, where replacing itself is also supported. Since this code has been refactored a few times, this change will only apply to v5.7 and later. 
Adjustments could be made to commit 1020c1f24a94 ("bpf: Simplify __cgroup_bpf_attach") and commit d7bf2c10af05 ("bpf: allocate cgroup storage entries on attaching bpf programs") as well as commit 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608162202.94002-1-lmb@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fdf7836750a3..4d76f16524cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, } list_for_each_entry(pl, progs, node) { - if (prog && pl->prog == prog) + if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) -- cgit v1.2.3 From 281920b7e0b31e0a7706433ff58e7d52ac97c327 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:46 +0200 Subject: bpf: Devmap adjust uapi for attach bpf program V2: - Defer changing BPF-syscall to start at file-descriptor 1 - Use {} to zero initialise struct. The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry"), introduced the ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor. As zero was a valid FD, not using the feature requires using the value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. This will break older userspace applications not using the bpf_prog feature. Consider an old userspace app that is compiled against a newer kernel uapi/bpf.h: it will not know that it needs to initialise the member bpf_prog.fd to minus-1. Thus, users will be forced to update source code to get the program running on newer kernels. This patch removes the minus-1 checks and makes zero mean the feature isn't used. Followup patches, either for the kernel or for libbpf, should handle this and avoid returning file-descriptor zero in the first place. 
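For illustration only (not part of the patch), this is roughly what the relaxed semantics look like from userspace; the map and program file descriptors are placeholders, and struct bpf_devmap_val / bpf_map_update_elem() are the uapi and libbpf interfaces referred to above:

	struct bpf_devmap_val val = {};   /* zero-initialised: bpf_prog.fd == 0 now
	                                   * simply means "no program attached" */
	__u32 key = 0;

	val.ifindex = target_ifindex;     /* placeholder: redirect target netdev */
	val.bpf_prog.fd = xdp_prog_fd;    /* placeholder: optional, leave 0 to opt out */

	if (bpf_map_update_elem(devmap_fd, &key, &val, BPF_ANY))
		/* handle the error */;
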
Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170950687.2102545.7235914718298050113.stgit@firesoul --- kernel/bpf/devmap.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfdff2faf5cb..0cbb72cdaf63 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; @@ -619,7 +610,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { + if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) @@ -653,8 +644,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -670,7 +661,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) + if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); @@ -700,8 +691,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 3021e69219e2f3df6d01243000db32d1325cdd0d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 10 Jun 2020 18:41:28 -0700 Subject: kcov: check kcov_softirq in kcov_remote_stop() kcov_remote_stop() should check that the corresponding kcov_remote_start() actually found the specified remote handle and started collecting coverage. This is done by checking the per thread kcov_softirq flag. A particular failure scenario where this was observed involved a softirq with a remote coverage collection section coming between check_kcov_mode() and the access to t->kcov_area in __sanitizer_cov_trace_pc(). In that softirq kcov_remote_start() bailed out after kcov_remote_find() check, but the matching kcov_remote_stop() didn't check if kcov_remote_start() succeeded, and overwrote per thread kcov parameters with invalid (zero) values. 
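A hedged sketch of the contract the fix enforces (editorial; kcov_remote_start(), kcov_remote_stop() and the kcov_softirq flag are the symbols used in the diff below, and "handle" is a placeholder):

	/* inside a softirq handler that wraps a remote coverage section */
	kcov_remote_start(handle);  /* may bail out if no kcov device owns 'handle' */
	/* ... work whose coverage should be attributed to 'handle' ... */
	kcov_remote_stop();         /* with this fix: returns early unless the start
	                             * really set t->kcov_softirq, so a failed start
	                             * can no longer zero the task's kcov state */
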
Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Marco Elver Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/fcd1cd16eac1d2c01a66befd8ea4afc6f8d09833.1591576806.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 55c5d883a93e..6afae0bcbac4 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -427,7 +427,8 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(!kcov->remote && kcov->t != t); * * For KCOV_REMOTE_ENABLE devices, the exiting task is either: - * 2. A remote task between kcov_remote_start() and kcov_remote_stop(). + * + * 1. A remote task between kcov_remote_start() and kcov_remote_stop(). * In this case we should print a warning right away, since a task * shouldn't be exiting when it's in a kcov coverage collection * section. Here t points to the task that is collecting remote @@ -437,7 +438,7 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(kcov->remote && kcov->t != t); * * 2. The task that created kcov exiting without calling KCOV_DISABLE, - * and then again we can make sure that t->kcov->t == t: + * and then again we make sure that t->kcov->t == t: * WARN_ON(kcov->remote && kcov->t != t); * * By combining all three checks into one we get: @@ -764,7 +765,7 @@ static const struct file_operations kcov_fops = { * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to - * be collected via __sanitizer_cov_trace_pc() + * be collected via __sanitizer_cov_trace_pc(). * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ @@ -972,16 +973,25 @@ void kcov_remote_stop(void) local_irq_restore(flags); return; } - kcov = t->kcov; - area = t->kcov_area; - size = t->kcov_size; - sequence = t->kcov_sequence; - + /* + * When in softirq, check if the corresponding kcov_remote_start() + * actually found the remote handle and started collecting coverage. + */ + if (in_serving_softirq() && !t->kcov_softirq) { + local_irq_restore(flags); + return; + } + /* Make sure that kcov_softirq is only set when in softirq. */ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { local_irq_restore(flags); return; } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + kcov_stop(t); if (in_serving_softirq()) { t->kcov_softirq = 0; -- cgit v1.2.3 From 9bf5b9eb232b34738800868e30bea3bad4a6a1ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:41:59 -0700 Subject: kernel: move use_mm/unuse_mm to kthread.c Patch series "improve use_mm / unuse_mm", v2. This series improves the use_mm / unuse_mm interface by better documenting the assumptions, and my taking the set_fs manipulations spread over the callers into the core API. This patch (of 3): Use the proper API instead. Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de These helpers are only for use with kernel threads, and I will tie them more into the kthread infrastructure going forward. 
Also move the prototypes to kthread.h - mmu_context.h was a little weird to start with as it otherwise contains very low-level MM bits. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Felix Kuehling Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de Link: http://lkml.kernel.org/r/20200416053158.586887-1-hch@lst.de Link: http://lkml.kernel.org/r/20200404094101.672954-5-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..ce4610316377 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. + * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include +#include +#include #include +#include #include #include #include @@ -25,6 +29,7 @@ #include #include + static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -1203,6 +1208,57 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + mmgrab(mm); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != mm) + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + sync_mm_rss(mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); + #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread -- cgit v1.2.3 From f5678e7f2ac31c270334b936352f0ef2fe7dd2b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:42:06 -0700 Subject: kernel: better document the use_mm/unuse_mm API contract Switch the function documentation to kerneldoc comments, and add WARN_ON_ONCE asserts that the calling thread is a kernel thread and does not have ->mm set (or has ->mm set in the case of unuse_mm). Also give the functions a kthread_ prefix to better document the use case. 
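A minimal usage sketch after the rename (editorial, not from the patch; "mm", "kbuf", "ubuf" and "len" are placeholders, and the set_fs/USER_DS detail is handled by a later patch in this series):

	/* from a kernel thread (PF_KTHREAD) servicing a user address space */
	kthread_use_mm(mm);                       /* adopt the mm; warns if the
	                                           * caller is not a kthread */
	ret = copy_from_user(kbuf, ubuf, len);    /* user accesses resolve in 'mm' */
	kthread_unuse_mm(mm);                     /* drop it again; warns if no mm
	                                           * was in use */
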
[hch@lst.de: fix a comment typo, cover the newly merged use_mm/unuse_mm caller in vfio] Link: http://lkml.kernel.org/r/20200416053158.586887-3-hch@lst.de [sfr@canb.auug.org.au: powerpc/vas: fix up for {un}use_mm() rename] Link: http://lkml.kernel.org/r/20200422163935.5aa93ba5@canb.auug.org.au Signed-off-by: Christoph Hellwig Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Felix Kuehling Acked-by: Greg Kroah-Hartman [usb] Acked-by: Haren Myneni Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Link: http://lkml.kernel.org/r/20200404094101.672954-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ce4610316377..8ed4b4fbec7c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1208,18 +1208,18 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -/* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_use_mm - make the calling kthread operate on an address space + * @mm: address space to operate on */ -void use_mm(struct mm_struct *mm) +void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(tsk->mm); + task_lock(tsk); active_mm = tsk->active_mm; if (active_mm != mm) { @@ -1236,20 +1236,19 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } -EXPORT_SYMBOL_GPL(use_mm); +EXPORT_SYMBOL_GPL(kthread_use_mm); -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_unuse_mm - reverse the effect of kthread_use_mm() + * @mm: address space to operate on */ -void unuse_mm(struct mm_struct *mm) +void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(!tsk->mm); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; @@ -1257,7 +1256,7 @@ void unuse_mm(struct mm_struct *mm) enter_lazy_tlb(mm, tsk); task_unlock(tsk); } -EXPORT_SYMBOL_GPL(unuse_mm); +EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** -- cgit v1.2.3 From 37c54f9bd48663f7657a9178fe08c47e4f5b537b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:42:10 -0700 Subject: kernel: set USER_DS in kthread_use_mm Some architectures like arm64 and s390 require USER_DS to be set for kernel threads to access user address space, which is the whole purpose of kthread_use_mm, but others like x86 don't. That has led to a huge mess where some callers are fixed up once they are tested on said architectures, while others linger around and yet others like io_uring try to do "clever" optimizations for what usually is just a trivial assignment to a member in the thread_struct for most architectures. Make kthread_use_mm set USER_DS, and kthread_unuse_mm restore to the previous value instead. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Michael S. 
Tsirkin Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Felix Kuehling Cc: Jason Wang Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Link: http://lkml.kernel.org/r/20200404094101.672954-7-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8ed4b4fbec7c..86357cd38eb2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,6 +52,7 @@ struct kthread { unsigned long flags; unsigned int cpu; void *data; + mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -1235,6 +1236,9 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); + + to_kthread(tsk)->oldfs = get_fs(); + set_fs(USER_DS); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1249,6 +1253,8 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); + set_fs(to_kthread(tsk)->oldfs); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; -- cgit v1.2.3 From 2a9e5ded9543436620a7fbc9329ddcc32bf97bc7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 20 May 2020 12:22:33 +0200 Subject: printk/kdb: Redirect printk messages into kdb in any context kdb has to get messages on consoles even when the system is stopped. It uses kdb_printf() internally and calls console drivers on its own. It uses a hack to reuse an existing code. It sets "kdb_trap_printk" global variable to redirect even the normal printk() into the kdb_printf() variant. The variable "kdb_trap_printk" is checked in printk_default() and it is ignored when printk is redirected to printk_safe in NMI context. Solve this by moving the check into printk_func(). It is obvious that it is not fully safe. But it does not make things worse. The console drivers are already called in this context by db_printf() direct calls. Reported-by: Sumit Garg Tested-by: Sumit Garg Reviewed-by: Daniel Thompson Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200520102233.GC3464@linux-b0ei --- kernel/printk/printk.c | 14 +------------- kernel/printk/printk_safe.c | 7 +++++++ 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9fdd6a42ad6a..2167bb528dd3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2047,18 +2046,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - int r; - -#ifdef CONFIG_KGDB_KDB - /* Allow to pass printk() to kdb but avoid a recursion. */ - if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { - r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); - return r; - } -#endif - r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); - - return r; + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..7ccb821d0bfe 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,12 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. 
*/ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + /* * Try to use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. -- cgit v1.2.3 From 0372007f5a79d61d3cb48a507717b9afb5d6addd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 11:05:22 +0100 Subject: context_tracking: Ensure that the critical path cannot be instrumented context tracking lacks a few protection mechanisms against instrumentation: - While the core functions are marked NOKPROBE they lack protection against function tracing which is required as the function entry/exit points can be utilized by BPF. - static functions invoked from the protected functions need to be marked as well as they can be instrumented otherwise. - using plain inline allows the compiler to emit traceable and probable functions. Fix this by marking the functions noinstr and converting the plain inlines to __always_inline. The NOKPROBE_SYMBOL() annotations are removed as the .noinstr.text section is already excluded from being probed. Cures the following objtool warnings: vmlinux.o: warning: objtool: enter_from_user_mode()+0x34: call to __context_tracking_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: prepare_exit_to_usermode()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_return_slowpath()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_syscall_64()+0x7f: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x3d: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_fast_syscall_32()+0x9c: call to __context_tracking_enter() leaves .noinstr.text section and generates new ones... Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.811520478@linutronix.de --- kernel/context_tracking.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ce430885c26c..36a98c48aedc 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -static bool context_tracking_recursion_enter(void) +static noinstr bool context_tracking_recursion_enter(void) { int recursion; @@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void) return false; } -static void context_tracking_recursion_exit(void) +static __always_inline void context_tracking_recursion_exit(void) { __this_cpu_dec(context_tracking.recursion); } @@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void __context_tracking_enter(enum ctx_state state) +void noinstr __context_tracking_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state) * on the tick. 
*/ if (state == CONTEXT_USER) { + instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); + instrumentation_end(); } rcu_user_enter(); } @@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state) } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_enter); EXPORT_SYMBOL_GPL(__context_tracking_enter); void context_tracking_enter(enum ctx_state state) @@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void __context_tracking_exit(enum ctx_state state) +void noinstr __context_tracking_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; @@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state) */ rcu_user_exit(); if (state == CONTEXT_USER) { + instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); + instrumentation_end(); } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_exit); EXPORT_SYMBOL_GPL(__context_tracking_exit); void context_tracking_exit(enum ctx_state state) -- cgit v1.2.3 From 5916d5f9b3347344a3d96ba6b54cf8e290eba96a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2020 13:49:51 +0100 Subject: bug: Annotate WARN/BUG/stackfail as noinstr safe Warnings, bugs and stack protection fails from noinstr sections, e.g. low level and early entry code, are likely to be fatal. Mark them as "safe" to be invoked from noinstr protected code to avoid annotating all usage sites. Getting the information out is important. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.376598577@linutronix.de --- kernel/panic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 85568bbfb12b..e2157ca387c8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -680,10 +680,12 @@ device_initcall(register_warn_debugfs); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -__visible void __stack_chk_fail(void) +__visible noinstr void __stack_chk_fail(void) { + instrumentation_begin(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); -- cgit v1.2.3 From 865d3a9afe7eddf320e7f61a442864d6efe27505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 21:22:36 +0200 Subject: x86/mce: Address objtools noinstr complaints Mark the relevant functions noinstr, use the plain non-instrumented MSR accessors. The only odd part is the instrumentation_begin()/end() pair around the indirect machine_check_vector() call as objtool can't figure that out. The possible invoked functions are annotated correctly. Also use notrace variant of nmi_enter/exit(). If MCEs happen then hardware latency tracing is the least of the worries. 
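As a hedged sketch of the annotation pattern described here (editorial; the handler names are placeholders, while noinstr and instrumentation_begin()/instrumentation_end() are the real annotations):

	noinstr void example_mce_entry(struct pt_regs *regs)
	{
		/* early entry work: no tracing, kprobes or other instrumentation */
		instrumentation_begin();
		/* objtool cannot follow the indirect call, so the instrumentable
		 * region around it is opened explicitly */
		mce_handler(regs);                /* placeholder for the indirect call */
		instrumentation_end();
		/* late exit work: back in the non-instrumentable section */
	}
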
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.476734898@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9ebaab13339d..d20d489841c8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. */ -time64_t __ktime_get_real_seconds(void) +noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.2.3 From 8a6bc4787f05d087fda8e11ead225c8830250703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:21 +0200 Subject: genirq: Provide irq_enter/exit_rcu() irq_enter()/exit() currently include RCU handling. To properly separate the RCU handling code, provide variants which contain only the non-RCU related functionality. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.567023613@linutronix.de --- kernel/softirq.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index a47c6dd57452..beb8e3a66c7c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } -/* - * Enter an interrupt context. +/** + * irq_enter_rcu - Enter an interrupt context with RCU watching */ -void irq_enter(void) +void irq_enter_rcu(void) { - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd @@ -354,10 +353,18 @@ void irq_enter(void) tick_irq_enter(); _local_bh_enable(); } - __irq_enter(); } +/** + * irq_enter - Enter an interrupt context including RCU update + */ +void irq_enter(void) +{ + rcu_irq_enter(); + irq_enter_rcu(); +} + static inline void invoke_softirq(void) { if (ksoftirqd_running(local_softirq_pending())) @@ -397,10 +404,12 @@ static inline void tick_irq_exit(void) #endif } -/* - * Exit an interrupt context. Process softirqs if needed and possible: +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. */ -void irq_exit(void) +void irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -413,6 +422,16 @@ void irq_exit(void) invoke_softirq(); tick_irq_exit(); +} + +/** + * irq_exit - Exit an interrupt context, update RCU and lockdep + * + * Also processes softirqs if needed and possible. + */ +void irq_exit(void) +{ + irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); -- cgit v1.2.3 From 59bc300b712998d10254ee20e24f2e7ec09c560a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:39 +0200 Subject: x86/entry: Clarify irq_{enter,exit}_rcu() Because: irq_enter_rcu() includes lockdep_hardirq_enter() irq_exit_rcu() does *NOT* include lockdep_hardirq_exit() Which resulted in two 'stray' lockdep_hardirq_exit() calls in idtentry.h, and me spending a long time trying to find the matching enter calls. 
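A hedged sketch of the pairing this change restores (editorial; handle_vector() is a placeholder):

	/* an idtentry-style interrupt path after this change */
	irq_enter_rcu();      /* includes lockdep_hardirq_enter() */
	handle_vector(regs);  /* placeholder for the actual handler */
	irq_exit_rcu();       /* now also does lockdep_hardirq_exit(), so no stray
	                       * call is needed at the call site; plain irq_exit()
	                       * additionally takes care of rcu_irq_exit() */
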
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.359433429@infradead.org --- kernel/softirq.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index beb8e3a66c7c..c4201b7f42b1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -404,12 +404,7 @@ static inline void tick_irq_exit(void) #endif } -/** - * irq_exit_rcu() - Exit an interrupt context without updating RCU - * - * Also processes softirqs if needed and possible. - */ -void irq_exit_rcu(void) +static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -424,6 +419,18 @@ void irq_exit_rcu(void) tick_irq_exit(); } +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. + */ +void irq_exit_rcu(void) +{ + __irq_exit_rcu(); + /* must be last! */ + lockdep_hardirq_exit(); +} + /** * irq_exit - Exit an interrupt context, update RCU and lockdep * @@ -431,7 +438,7 @@ void irq_exit_rcu(void) */ void irq_exit(void) { - irq_exit_rcu(); + __irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); -- cgit v1.2.3 From bf2b3008440072068580c609d79a079656af0588 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:40 +0200 Subject: x86/entry: Rename trace_hardirqs_off_prepare() The typical pattern for trace_hardirqs_off_prepare() is: ENTRY lockdep_hardirqs_off(); // because hardware ... do entry magic instrumentation_begin(); trace_hardirqs_off_prepare(); ... do actual work trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); ... do exit magic lockdep_hardirqs_on(); which shows that it's named wrong, rename it to trace_hardirqs_off_finish(), as it concludes the hardirq_off transition. Also, given that the above is the only correct order, make the traditional all-in-one trace_hardirqs_off() follow suit. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.415774872@infradead.org --- kernel/trace/trace_preemptirq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index fb0691b8a88d..f10073e62603 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -58,7 +58,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on); * and lockdep uses a staged approach which splits the lockdep hardirq * tracking into a RCU on and a RCU off section. 
*/ -void trace_hardirqs_off_prepare(void) +void trace_hardirqs_off_finish(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); @@ -68,19 +68,19 @@ void trace_hardirqs_off_prepare(void) } } -EXPORT_SYMBOL(trace_hardirqs_off_prepare); -NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); +EXPORT_SYMBOL(trace_hardirqs_off_finish); +NOKPROBE_SYMBOL(trace_hardirqs_off_finish); void trace_hardirqs_off(void) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); -- cgit v1.2.3 From 6eebad1ad303db360ebe3e51c2b9656c3d407157 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:21 +0200 Subject: lockdep: __always_inline more for noinstr vmlinux.o: warning: objtool: debug_locks_off()+0xd: call to __debug_locks_off() leaves .noinstr.text section vmlinux.o: warning: objtool: match_held_lock()+0x6a: call to look_up_lock_class.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x90: call to lockdep_recursion_finish() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.185201076@infradead.org --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 38cce34d03dc..29a8de4c50b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,7 +393,7 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -static inline void lockdep_recursion_finish(void) +static __always_inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) current->lockdep_recursion = 0; @@ -801,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static inline struct lock_class * +static __always_inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; -- cgit v1.2.3 From 75d75b7a4d5489cc6a5e91ace306f6c13f376f33 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 21 May 2020 16:20:39 +0200 Subject: kcsan: Support distinguishing volatile accesses In the kernel, the "volatile" keyword is used in various concurrent contexts, whether in low-level synchronization primitives or for legacy reasons. If supported by the compiler, it will be assumed that aligned volatile accesses up to sizeof(long long) (matching compiletime_assert_rwonce_type()) are atomic. Recent versions of Clang [1] (GCC tentative [2]) can instrument volatile accesses differently. Add the option (required) to enable the instrumentation, and provide the necessary runtime functions. None of the updated compilers are widely available yet (Clang 11 will be the first release to support the feature). [1] https://github.com/llvm/llvm-project/commit/5a2c31116f412c3b6888be361137efd705e05814 [2] https://gcc.gnu.org/pipermail/gcc-patches/2020-April/544452.html This change allows removing of any explicit checks in primitives such as READ_ONCE() and WRITE_ONCE(). [ bp: Massage commit message a bit. 
] Signed-off-by: Marco Elver Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200521142047.169334-4-elver@google.com --- kernel/kcsan/core.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index a73a66cf79df..15f67949d11e 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -789,6 +789,49 @@ void __tsan_write_range(void *ptr, size_t size) } EXPORT_SYMBOL(__tsan_write_range); +/* + * Use of explicit volatile is generally disallowed [1], however, volatile is + * still used in various concurrent context, whether in low-level + * synchronization primitives or for legacy reasons. + * [1] https://lwn.net/Articles/233479/ + * + * We only consider volatile accesses atomic if they are aligned and would pass + * the size-check of compiletime_assert_rwonce_type(). + */ +#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_read##size); \ + void __tsan_unaligned_volatile_read##size(void *ptr) \ + __alias(__tsan_volatile_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, \ + KCSAN_ACCESS_WRITE | \ + (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_write##size); \ + void __tsan_unaligned_volatile_write##size(void *ptr) \ + __alias(__tsan_volatile_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size) + +DEFINE_TSAN_VOLATILE_READ_WRITE(1); +DEFINE_TSAN_VOLATILE_READ_WRITE(2); +DEFINE_TSAN_VOLATILE_READ_WRITE(4); +DEFINE_TSAN_VOLATILE_READ_WRITE(8); +DEFINE_TSAN_VOLATILE_READ_WRITE(16); + /* * The below are not required by KCSAN, but can still be emitted by the * compiler. -- cgit v1.2.3 From 29fcb05bbf1a7008900bb9bee347bdbfc7171036 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 17:21:15 -0700 Subject: bpf: Undo internal BPF_PROBE_MEM in BPF insns dump BPF_PROBE_MEM is kernel-internal implmementation details. When dumping BPF instructions to user-space, it needs to be replaced back with BPF_MEM mode. 
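As a hedged reminder of the opcode layout this relies on (editorial; BPF_LDX, BPF_W, BPF_MEM and the BPF_CLASS()/BPF_SIZE()/BPF_MODE() accessors are the standard BPF opcode macros, while BPF_PROBE_MEM is the kernel-internal mode mentioned above):

	/* a load opcode is composed as class | size | mode */
	u8 code = BPF_LDX | BPF_W | BPF_MEM;      /* ordinary 32-bit load */

	/* the verifier may retag kernel-pointer loads as BPF_PROBE_MEM internally;
	 * for a user-visible dump the mode is forced back to BPF_MEM: */
	if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM)
		code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
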
Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200613002115.1632142-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..e9a3ebc00e08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3158,6 +3158,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) struct bpf_insn *insns; u32 off, type; u64 imm; + u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), @@ -3166,21 +3167,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + code = insns[i].code; + + if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + if (code == (BPF_JMP | BPF_CALL) || + code == (BPF_JMP | BPF_CALL_ARGS)) { + if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok()) insns[i].imm = 0; continue; } + if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { + insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; + continue; + } - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; -- cgit v1.2.3 From a7f7f6248d9740d710fd6bd190293fe5e16410ac Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 14 Jun 2020 01:50:22 +0900 Subject: treewide: replace '---help---' in Kconfig files with 'help' Since commit 84af7a6194e4 ("checkpatch: kconfig: prefer 'help' over '---help---'"), the number of '---help---' has been gradually decreasing, but there are still more than 2400 instances. This commit finishes the conversion. While I touched the lines, I also fixed the indentation. There are a variety of indentation styles found. a) 4 spaces + '---help---' b) 7 spaces + '---help---' c) 8 spaces + '---help---' d) 1 space + 1 tab + '---help---' e) 1 tab + '---help---' (correct indentation) f) 1 tab + 1 space + '---help---' g) 1 tab + 2 spaces + '---help---' In order to convert all of them to 1 tab + 'help', I ran the following commend: $ find . -name 'Kconfig*' | xargs sed -i 's/^[[:space:]]*---help---/\thelp/' Signed-off-by: Masahiro Yamada --- kernel/gcov/Kconfig | 4 ++-- kernel/irq/Kconfig | 4 ++-- kernel/power/Kconfig | 26 +++++++++++++------------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index feaad597b3f4..3110c77230c7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -6,7 +6,7 @@ config GCOV_KERNEL depends on DEBUG_FS select CONSTRUCTORS if !UML default n - ---help--- + help This option enables gcov-based code profiling (e.g. for code coverage measurements). @@ -42,7 +42,7 @@ config GCOV_PROFILE_ALL depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n - ---help--- + help This options activates profiling for the entire kernel. If unsure, say N. 
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d63c324895ea..20512252ecc9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -118,7 +118,7 @@ config IRQ_FORCED_THREADING config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- + help Sparse irq numbering is useful for distro kernels that want to define a high CONFIG_NR_CPUS value but still want to have @@ -134,7 +134,7 @@ config GENERIC_IRQ_DEBUGFS depends on DEBUG_FS select GENERIC_IRQ_INJECTION default n - ---help--- + help Exposes internal state information through debugfs. Mostly for developers and debugging of hard to diagnose interrupt problems. diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4d0e6e815a2b..a7320f07689d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -3,7 +3,7 @@ config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE default y - ---help--- + help Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). @@ -42,7 +42,7 @@ config HIBERNATION select LZO_COMPRESS select LZO_DECOMPRESS select CRC32 - ---help--- + help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. @@ -84,7 +84,7 @@ config HIBERNATION_SNAPSHOT_DEV bool "Userspace snapshot device" depends on HIBERNATION default y - ---help--- + help Device used by the uswsusp tools. Say N if no snapshotting from userspace is needed, this also @@ -96,7 +96,7 @@ config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION default "" - ---help--- + help The default resume partition is the partition that the suspend- to-disk implementation will look for a suspended disk image. @@ -131,7 +131,7 @@ config PM_SLEEP_SMP_NONZERO_CPU def_bool y depends on PM_SLEEP_SMP depends on ARCH_SUSPEND_NONZERO_CPU - ---help--- + help If an arch can suspend (for suspend, hibernate, kexec, etc) on a non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This will allow nohz_full mask to include CPU0. @@ -140,7 +140,7 @@ config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP default n - ---help--- + help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -148,7 +148,7 @@ config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP default n - ---help--- + help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -165,7 +165,7 @@ config PM_WAKELOCKS_GC config PM bool "Device power management core functionality" - ---help--- + help Enable functionality allowing I/O devices to be put into energy-saving (low power) states, for example after a specified period of inactivity (autosuspended), and woken up in response to a hardware-generated @@ -179,7 +179,7 @@ config PM config PM_DEBUG bool "Power Management Debug Support" depends on PM - ---help--- + help This option enables various debugging support in the Power Management code. This is helpful when debugging and reporting PM bugs, like suspend support. 
@@ -187,7 +187,7 @@ config PM_DEBUG config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - ---help--- + help Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". @@ -195,7 +195,7 @@ config PM_ADVANCED_DEBUG config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- + help This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. Enable this with a kernel parameter like "test_suspend=mem". @@ -210,7 +210,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" depends on PM_DEBUG && PSTORE && EXPERT - ---help--- + help Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. A detected lockup causes system panic with message @@ -243,7 +243,7 @@ config PM_TRACE_RTC depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE - ---help--- + help This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). -- cgit v1.2.3 From 39030e1351aa1aa7443bb2da24426573077c83da Mon Sep 17 00:00:00 2001 From: Thomas Cedeno Date: Tue, 9 Jun 2020 10:22:13 -0700 Subject: security: Add LSM hooks to set*gid syscalls The SafeSetID LSM uses the security_task_fix_setuid hook to filter set*uid() syscalls according to its configured security policy. In preparation for adding analagous support in the LSM for set*gid() syscalls, we add the requisite hook here. Tested by putting print statements in the security_task_fix_setgid hook and seeing them get hit during kernel boot. Signed-off-by: Thomas Cedeno Signed-off-by: Micah Morton --- kernel/sys.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..f5c06c48d585 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -393,6 +393,10 @@ long __sys_setregid(gid_t rgid, gid_t egid) new->sgid = new->egid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -435,6 +439,10 @@ long __sys_setgid(gid_t gid) else goto error; + retval = security_task_fix_setgid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -756,6 +764,10 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) new->sgid = ksgid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -862,7 +874,8 @@ long __sys_setfsgid(gid_t gid) ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; - goto change_okay; + if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) + goto change_okay; } } -- cgit v1.2.3 From dbed452a078d56bc7f1abecc3edd6a75e8e4484e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 00:25:57 -0700 Subject: dma-pool: decouple DMA_REMAP from DMA_COHERENT_POOL DMA_REMAP is an unnecessary requirement for AMD SEV, which requires DMA_COHERENT_POOL, so avoid selecting it when it is otherwise unnecessary. 
The only other requirement for DMA coherent pools is DMA_DIRECT_REMAP, so ensure that properly selects the config option when needed. Fixes: 82fef0ad811f ("x86/mm: unencrypted non-blocking DMA allocations use coherent pools") Reported-by: Alex Xu (Hello71) Signed-off-by: David Rientjes Tested-by: Alex Xu (Hello71) Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index d006668c0027..a0ce3c1494fd 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -73,18 +73,18 @@ config SWIOTLB config DMA_NONCOHERENT_MMAP bool +config DMA_COHERENT_POOL + bool + config DMA_REMAP + bool depends on MMU select GENERIC_ALLOCATOR select DMA_NONCOHERENT_MMAP - bool - -config DMA_COHERENT_POOL - bool - select DMA_REMAP config DMA_DIRECT_REMAP bool + select DMA_REMAP select DMA_COHERENT_POOL config DMA_CMA -- cgit v1.2.3 From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:08 +0300 Subject: perf: Add perf text poke event Record (single instruction) changes to the kernel text (i.e. self-modifying code) in order to support tracers like Intel PT and ARM CoreSight. A copy of the running kernel code is needed as a reference point (e.g. from /proc/kcore). The text poke event records the old bytes and the new bytes so that the event can be processed forwards or backwards. The basic problem is recording the modified instruction in an unambiguous manner given SMP instruction cache (in)coherence. That is, when modifying an instruction concurrently any solution with one or multiple timestamps is not sufficient: CPU0 CPU1 0 1 write insn A 2 execute insn A 3 sync-I$ 4 Due to I$, CPU1 might execute either the old or new A. No matter where we record tracepoints on CPU0, one simply cannot tell what CPU1 will have observed, except that at 0 it must be the old one and at 4 it must be the new one. To solve this, take inspiration from x86 text poking, which has to solve this exact problem due to variable length instruction encoding and I-fetch windows. 1) overwrite the instruction with a breakpoint and sync I$ This guarantees that that code flow will never hit the target instruction anymore, on any CPU (or rather, it will cause an exception). 2) issue the TEXT_POKE event 3) overwrite the breakpoint with the new instruction and sync I$ Now we know that any execution after the TEXT_POKE event will either observe the breakpoint (and hit the exception) or the new instruction. So by guarding the TEXT_POKE event with an exception on either side; we can now tell, without doubt, which instruction another CPU will have observed. 
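From a consumer's point of view, the resulting ring-buffer record can be pictured as follows. This is a sketch inferred from perf_event_text_poke_output() in the diff below; the struct name is illustrative, only PERF_RECORD_TEXT_POKE itself is new ABI:

	struct text_poke_record {
		struct perf_event_header header; /* .type == PERF_RECORD_TEXT_POKE */
		__u64 addr;                      /* start of the patched range */
		__u16 old_len;                   /* length of the old bytes */
		__u16 new_len;                   /* length of the new bytes */
		__u8  bytes[];                   /* old bytes, then new bytes,
						  * padded to a u64 boundary,
						  * followed by the sample_id fields */
	};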
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com --- kernel/events/core.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = 
(unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* -- cgit v1.2.3 From d002b8bc6dbc20e9043e279196cff8795dba05fe Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 28 May 2020 11:00:58 +0300 Subject: kprobes: Add symbols for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Note: kprobe insn pages are not used if ftrace is configured. To see the effect of this patch, the kernel must be configured with: # CONFIG_FUNCTION_TRACER is not set CONFIG_KPROBES=y and for optimised kprobes: CONFIG_OPTPROBES=y Example on x86: # perf probe __schedule Added new event: probe:__schedule (on __schedule) # cat /proc/kallsyms | grep '\[__builtin__kprobes\]' ffffffffc00d4000 t kprobe_insn_page [__builtin__kprobes] ffffffffc00d6000 t kprobe_optinsn_page [__builtin__kprobes] Note: This patch adds "__builtin__kprobes" as a module name in /proc/kallsyms for symbols for pages allocated for kprobes' purposes, even though "__builtin__kprobes" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200528080058.20230-1-adrian.hunter@intel.com --- kernel/kallsyms.c | 37 +++++++++++++++++++++++++++++++++---- kernel/kprobes.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..c6cc293c0e67 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -496,11 +498,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. 
*/ @@ -527,6 +551,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +576,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 50cd84f53df0..058c0be3464b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -118,6 +118,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -290,12 +291,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -2197,6 +2220,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; -- cgit v1.2.3 From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:11 +0300 Subject: kprobes: Add perf ksymbol events for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. 
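Concretely, tools now receive a PERF_RECORD_KSYMBOL with the out-of-line (OOL) type when a kprobe insn page is created, and the same record with the unregister flag when it is freed. Roughly (a sketch of the existing ksymbol record format; the field comments show the kprobe-specific values):

	struct ksymbol_record {
		struct perf_event_header header; /* .type == PERF_RECORD_KSYMBOL */
		__u64 addr;                      /* kip->insns */
		__u32 len;                       /* PAGE_SIZE */
		__u16 ksym_type;                 /* PERF_RECORD_KSYMBOL_TYPE_OOL */
		__u16 flags;                     /* unregister flag set on page free */
		char  name[];                    /* "kprobe_insn_page" or
						  * "kprobe_optinsn_page" */
	};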
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com --- kernel/kprobes.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 058c0be3464b..2b58740ca0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); -- cgit v1.2.3 From fc0ea795f53c8d7040fa42471f74fe51d78d0834 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:13 +0300 Subject: ftrace: Add symbols for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Example on x86 with CONFIG_DYNAMIC_FTRACE=y # echo function > /sys/kernel/debug/tracing/current_tracer # cat /proc/kallsyms | grep '\[__builtin__ftrace\]' ffffffffc0238000 t ftrace_trampoline [__builtin__ftrace] Note: This patch adds "__builtin__ftrace" as a module name in /proc/kallsyms for symbols for pages allocated for ftrace's purposes, even though "__builtin__ftrace" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-7-adrian.hunter@intel.com --- kernel/kallsyms.c | 5 ++++ kernel/trace/ftrace.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index c6cc293c0e67..834bfdc43235 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -482,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. 
+ */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c163c3531faf..31675b209db2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,38 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) + ftrace_remove_trampoline_from_kallsyms(ops); + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2966,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6210,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6567,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6540,8 +6594,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6609,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6801,12 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags 
& FTRACE_OPS_FL_ALLOC_TRAMP)) + ftrace_add_trampoline_to_kallsyms(ops); } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3 From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:14 +0300 Subject: ftrace: Add perf ksymbol events for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com --- kernel/trace/ftrace.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31675b209db2..2baaf7716537 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && - ops->trampoline) + ops->trampoline) { + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ ftrace_remove_trampoline_from_kallsyms(ops); + } arch_ftrace_trampoline_free(ops); } @@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); if (ops->trampoline && ops->trampoline != trampoline && - (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + } } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3 From 548e1f6c76e1eb80ba29edd4286b9b9f2c37f5bf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:15 +0300 Subject: ftrace: Add perf text poke events for ftrace trampolines Add perf text poke events for ftrace trampolines when created and when freed. There can be 3 text_poke events for ftrace trampolines: 1. NULL -> trampoline By ftrace_update_trampoline() when !ops->trampoline Trampoline created 2. [e.g. on x86] CALL rel32 -> CALL rel32 By arch_ftrace_update_trampoline() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP [e.g. on x86] via text_poke_bp() which generates text poke events Trampoline-called function target updated 3. trampoline -> NULL By ftrace_trampoline_free() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP Trampoline freed Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-9-adrian.hunter@intel.com --- kernel/trace/ftrace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2baaf7716537..d6bba734ab72 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2791,6 +2791,13 @@ static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. 
+ */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, true, FTRACE_TRAMPOLINE_SYM); @@ -6816,6 +6823,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, false, FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); } } -- cgit v1.2.3 From 3dc167ba5729ddd2d8e3fa1841653792c295d3f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 May 2020 19:25:06 +0200 Subject: sched/cputime: Improve cputime_adjust() People report that utime and stime from /proc//stat become very wrong when the numbers are big enough, especially if you watch these counters incrementally. Specifically, the current implementation of: stime*rtime/total, results in a saw-tooth function on top of the desired line, where the teeth grow in size the larger the values become. IOW, it has a relative error. The result is that, when watching incrementally as time progresses (for large values), we'll see periods of pure stime or utime increase, irrespective of the actual ratio we're striving for. Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64() that is far more accurate. This also allows architectures to override the implementation -- for instance they can opt for the old algorithm if this new one turns out to be too expensive for them. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net --- kernel/sched/cputime.c | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..5a55d2300452 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(cputime); } -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - /* * Adjust tick based cputime random precision against scheduler runtime * accounting. 
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* -- cgit v1.2.3 From 844eb6458facb09d4871a480d8bda06550927a80 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:01 +0200 Subject: sched/pelt: Remove redundant cap_scale() definition Besides in PELT cap_scale() is used in the Deadline scheduler class for scale-invariant bandwidth enforcement. Remove the cap_scale() definition in kernel/sched/pelt.c and keep the one in kernel/sched/sched.h. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-2-dietmar.eggemann@arm.com --- kernel/sched/pelt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b4b1ff96642f..dea5567e4f72 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 -- cgit v1.2.3 From 0900acf2d8273f79432a4ded122ad5a265e85783 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:02 +0200 Subject: sched/core: Remove redundant 'preempt' param from sched_class->yield_to_task() Commit 6d1cafd8b56e ("sched: Resched proper CPU on yield_to()") moved the code to resched the CPU from yield_to_task_fair() to yield_to() making the preempt parameter in sched_class->yield_to_task() unnecessary. Remove it. No other sched_class implements yield_to_task(). 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200603080304.16548-3-dietmar.eggemann@arm.com --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..9c89b0eaf796 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5810,7 +5810,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..6a4dab2f7c07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7157,7 +7157,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..8d5d06881294 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1748,7 +1748,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); -- cgit v1.2.3 From e3e76a6a04114ec95b0969cd026e8904c67b431b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:03 +0200 Subject: sched/idle,stop: Remove .get_rr_interval from sched_class The idle task and stop task sched_classes return 0 in this function. The single call site in sched_rr_get_interval() calls p->sched_class->get_rr_interval() only conditional in case it is defined. Otherwise time_slice=0 will be used. The deadline sched class does not define it. Commit a57beec5d427 ("sched: Make sched_class::get_rr_interval() optional") introduced the default time-slice=0 for sched classes which do not provide this function. So .get_rr_interval for idle and stop sched_class can be removed to shrink the code a little. 
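The call site that makes the removal safe is roughly the following (paraphrased from sched_rr_get_interval(); the helper is only for illustration and not part of this patch):

	static unsigned int rr_interval_of(struct rq *rq, struct task_struct *p)
	{
		unsigned int time_slice = 0;

		/* Classes without the hook implicitly report a 0 time slice. */
		if (p->sched_class->get_rr_interval)
			time_slice = p->sched_class->get_rr_interval(rq, p);

		return time_slice;
	}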
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200603080304.16548-4-dietmar.eggemann@arm.com --- kernel/sched/idle.c | 7 ------- kernel/sched/stop_task.c | 8 -------- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 05deb81bb3e3..8d75ca201484 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -446,11 +446,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -479,8 +474,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..3e50a6a8f1e5 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, -- cgit v1.2.3 From 1ca2034ed798aea72a68d3904bd39a6cbfbdf405 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:04 +0200 Subject: sched/fair: Remove unused 'sd' parameter from scale_rt_capacity() Since commit 8ec59c0f5f49 ("sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()") it is no longer needed. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-5-dietmar.eggemann@arm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a4dab2f7c07..69da576f7f48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8038,7 +8038,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -8070,7 +8070,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); -- cgit v1.2.3 From 4581bea8b4ec4de353369775dfef921191e393b3 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 27 May 2020 17:39:14 +0100 Subject: sched/debug: Add new tracepoints to track util_est The util_est signals are key elements for EAS task placement and frequency selection. Having tracepoints to track these signals enables load-tracking and schedutil testing and/or debugging by a toolkit. 
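A toolkit would typically hook these bare tracepoints from a module. A minimal sketch, assuming the register/unregister helpers generated for the exported tracepoints; the probe and module names are made up:

	#include <linux/module.h>
	#include <trace/events/sched.h>

	/* Hypothetical probe: invoked whenever a cfs_rq's util_est is updated. */
	static void probe_util_est_cfs(void *data, struct cfs_rq *cfs_rq)
	{
		/* sample cfs_rq->avg.util_est here, e.g. into a ring buffer */
	}

	static int __init util_est_mon_init(void)
	{
		return register_trace_sched_util_est_cfs_tp(probe_util_est_cfs, NULL);
	}

	static void __exit util_est_mon_exit(void)
	{
		unregister_trace_sched_util_est_cfs_tp(probe_util_est_cfs, NULL);
	}

	module_init(util_est_mon_init);
	module_exit(util_est_mon_exit);
	MODULE_LICENSE("GPL");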
Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/1590597554-370150-1-git-send-email-vincent.donnefort@arm.com --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c89b0eaf796..0208b71bef80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69da576f7f48..a785a9b262dd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3922,6 +3922,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } /* @@ -3952,6 +3954,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + trace_sched_util_est_cfs_tp(cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4017,6 +4021,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } static inline int task_fits_capacity(struct task_struct *p, long capacity) -- cgit v1.2.3 From 461daba06bdcb9c7a3f92b9bbd110e1f7d093ffc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 28 May 2020 12:54:42 -0700 Subject: psi: eliminate kthread_worker from psi trigger scheduling mechanism Each psi group requires a dedicated kthread_delayed_work and kthread_worker. Since no other work can be performed using psi_group's kthread_worker, the same result can be obtained using a task_struct and a timer directly. This makes psi triggering simpler by removing lists and locks involved with kthread_worker usage and eliminates the need for poll_scheduled atomic use in the hot path. 
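The user-visible trigger ABI served by this machinery is unchanged; for context, a sketch of the documented usage (see Documentation/accounting/psi.rst), written from user space:

	#include <fcntl.h>
	#include <poll.h>
	#include <string.h>
	#include <unistd.h>

	int watch_memory_pressure(void)
	{
		/* 150ms of "some" memory stall within any 1s window */
		const char trig[] = "some 150000 1000000";
		struct pollfd pfd;
		int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);

		if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0)
			return -1;

		pfd.fd = fd;
		pfd.events = POLLPRI;
		return poll(&pfd, 1, -1); /* wakes up when the trigger fires */
	}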
Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200528195442.190116-1-surenb@google.com --- kernel/sched/psi.c | 113 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..e53b711bd643 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,7 +190,6 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -199,7 +198,7 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) @@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. 
+ */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,6 +613,35 @@ out: mutex_unlock(&group->trigger_lock); } +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + struct sched_param param = { + .sched_priority = 1, + }; + + sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if (!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + init_waitqueue_head(&group->poll_wait); + wake_up_process(task); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref) { struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct task_struct *task_to_destroy = NULL; if (static_branch_likely(&psi_disabled)) return; @@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + 
group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } } @@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref) /* * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * poll_task RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_task */ synchronize_rcu(); /* * Destroy the kworker after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. + * can no longer be found through group->poll_task. * But it might have been already scheduled before * that - deschedule it cleanly before destroying it. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + del_timer_sync(&group->poll_timer); + kthread_stop(task_to_destroy); } kfree(t); } -- cgit v1.2.3 From 043eb8e1051143a24811e6f35c276e35ae8247b6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 27 May 2020 16:29:08 +0200 Subject: kthread: Switch to cpu_possible_mask Next patch will switch unbound kernel threads mask to housekeeping_cpumask(), a subset of cpu_possible_mask. So in order to ease bisection, lets first switch kthreads default affinity from cpu_all_mask to cpu_possible_mask. It looks safe to do so as cpu_possible_mask seem to be initialized at setup_arch() time, way before kthreadd is created. Suggested-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-2-frederic@kernel.org --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e3d2d7fdf5e..b86d37cda109 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -383,7 +383,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, cpu_possible_mask); } kfree(create); return task; @@ -608,7 +608,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, cpu_possible_mask); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 9cc5b8656892a72438ee7deb5e80f5be47643b8b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 27 May 2020 16:29:09 +0200 Subject: isolcpus: Affine unbound kernel threads to housekeeping cpus This is a kernel enhancement that configures the cpu affinity of kernel threads via kernel boot option nohz_full=. When this option is specified, the cpumask is immediately applied upon kthread launch. This does not affect kernel threads that specify cpu and node. 
This allows CPU isolation (that is not allowing certain threads to execute on certain CPUs) without using the isolcpus=domain parameter, making it possible to enable load balancing on such CPUs during runtime (see kernel-parameters.txt). Note-1: this is based off on Wind River's patch at https://github.com/starlingx-staging/stx-integ/blob/master/kernel/kernel-std/centos/patches/affine-compute-kernel-threads.patch Difference being that this patch is limited to modifying kernel thread cpumask. Behaviour of other threads can be controlled via cgroups or sched_setaffinity. Note-2: Wind River's patch was based off Christoph Lameter's patch at https://lwn.net/Articles/565932/ with the only difference being the kernel parameter changed from kthread to kthread_cpus. Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-3-frederic@kernel.org --- kernel/kthread.c | 6 ++++-- kernel/sched/isolation.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b86d37cda109..032b610912b0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_possible_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -608,7 +610,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_possible_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..5a6ea03f9882 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } -- cgit v1.2.3 From b4098bfc5efb1fd7ecf40165132a1283aeea3500 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jul 2019 16:54:10 +0200 Subject: sched/deadline: Impose global limits on sched_attr::sched_period Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726161357.397880775@infradead.org --- kernel/sched/deadline.c | 23 +++++++++++++++++++++-- kernel/sysctl.c | 14 ++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..f31964ad9c2e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2634,6 +2634,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_flags = dl_se->flags; } +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting rediculous long effective runtimes, on the bottom end we + * guard against timer DoS. 
+ */ +unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ + /* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal @@ -2646,6 +2654,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,12 +2679,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index db1ce7af2563..4aea67d3d552 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1779,6 +1779,20 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, -- cgit v1.2.3 From 3ea2f097b17e13a8280f1f9386c331b326a3dbef Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 9 Jun 2020 14:37:48 +0200 Subject: sched/fair: Fix NOHZ next idle balance With commit: 'b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK")' rebalance_domains of the local cfs_rq happens before others idle cpus have updated nohz.next_balance and its value is overwritten. Move the update of nohz.next_balance for other idles cpus before balancing and updating the next_balance of local cfs_rq. Also, the nohz.next_balance is now updated only if all idle cpus got a chance to rebalance their domains and the idle balance has not been aborted because of new activities on the CPU. In case of need_resched, the idle load balance will be kick the next jiffie in order to address remaining ilb. Fixes: b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK") Reported-by: Peng Liu Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20200609123748.18636-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a785a9b262dd..295c9ffa850b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10022,7 +10022,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. 
+ */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -10343,6 +10348,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10363,14 +10376,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } -- cgit v1.2.3 From 9b1b234bb86bcdcdb142e900d39b599185465dbb Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Tue, 9 Jun 2020 23:09:36 +0800 Subject: sched: correct SD_flags returned by tl->sd_flags() During sched domain init, we check whether non-topological SD_flags are returned by tl->sd_flags(), if found, fire a waning and correct the violation, but the code failed to correct the violation. Correct this. Fixes: 143e1e28cb40 ("sched: Rework sched_domain topology definition") Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200609150936.GA13060@iZj6chx1xj0e0buvshuecpZ --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..9079d865a935 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; -- cgit v1.2.3 From c81b89329933c6c0be809d4c0d2cb57c49153ee3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 20 May 2020 15:42:39 +0200 Subject: sched/deadline: Optimize dl_bw_cpus() Return the weight of the root domain (rd) span in case it is a subset of the cpu_active_mask. Continue to compute the number of CPUs over rd span and cpu_active_mask when in hotplug. 
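For reference, the resulting helper is roughly the following sketch (mirroring the hunk below):

	static inline int dl_bw_cpus(int i)
	{
		struct root_domain *rd = cpu_rq(i)->rd;
		int cpus;

		RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
				 "sched RCU must be held");

		/* No hotplug in progress: the whole rd span is active. */
		if (cpumask_subset(rd->span, cpu_active_mask))
			return cpumask_weight(rd->span);

		/* Otherwise count only the active CPUs of the span. */
		cpus = 0;
		for_each_cpu_and(i, rd->span, cpu_active_mask)
			cpus++;

		return cpus;
	}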
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-2-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f31964ad9c2e..ec90265e9d8e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,10 +54,16 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; -- cgit v1.2.3 From fc9dc698472aa460a8b3b036d9b1d0b751f12f58 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 20 May 2020 15:42:40 +0200 Subject: sched/deadline: Add dl_bw_capacity() Capacity-aware SCHED_DEADLINE Admission Control (AC) needs root domain (rd) CPU capacity sum. Introduce dl_bw_capacity() which for a symmetric rd w/ a CPU capacity of SCHED_CAPACITY_SCALE simply relies on dl_bw_cpus() to return #CPUs multiplied by SCHED_CAPACITY_SCALE. For an asymmetric rd or a CPU capacity < SCHED_CAPACITY_SCALE it computes the CPU capacity sum over rd span and cpu_active_mask. A 'XXX Fix:' comment was added to highlight that if 'rq->rd == def_root_domain' AC should be performed against the capacity of the CPU the task is running on rather the rd CPU capacity sum. This issue already exists w/o capacity awareness. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-3-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ec90265e9d8e..01f474a5bd14 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,6 +69,34 @@ static inline int dl_bw_cpus(int i) return cpus; } + +static inline unsigned long __dl_bw_capacity(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + unsigned long cap = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity) && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + return __dl_bw_capacity(i); + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -79,6 +107,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline -- cgit v1.2.3 From 60ffd5edc5e4fa69622c125c54ef8e7d5d894af8 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:41 +0200 Subject: sched/deadline: Improve admission control for asymmetric CPU capacities The current SCHED_DEADLINE (DL) admission control ensures that sum of reserved CPU bandwidth < x * M where x = /proc/sys/kernel/sched_rt_{runtime,period}_us M = # CPUs in root domain. 
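For example, with the default sched_rt_runtime_us = 950000 and sched_rt_period_us = 1000000, x = 0.95; on a root domain of M = 4 CPUs the test therefore admits at most 3.8 CPUs' worth of reserved DL bandwidth, regardless of how that bandwidth is spread across the CPUs.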
DL admission control works well for homogeneous systems where the capacity of all CPUs are equal (1024). I.e. bounded tardiness for DL and non-starvation of non-DL tasks is guaranteed. But on heterogeneous systems where capacity of CPUs are different it could fail by over-allocating CPU time on smaller capacity CPUs. On an Arm big.LITTLE/DynamIQ system DL tasks can easily starve other tasks making it unusable. Fix this by explicitly considering the CPU capacity in the DL admission test by replacing M with the root domain CPU capacity sum. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-4-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 30 +++++++++++++++++------------- kernel/sched/sched.h | 6 +++--- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 01f474a5bd14..9ebd0a9241ed 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2590,11 +2590,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2609,15 +2610,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2772,19 +2775,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2794,6 +2797,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). 
*/ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2826,16 +2831,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d5d06881294..91b250f265c0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -310,11 +310,11 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) __dl_update(dl_b, -((s32)tsk_bw / cpus)); } -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, + u64 old_bw, u64 new_bw) { return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; } extern void init_dl_bw(struct dl_bw *dl_b); -- cgit v1.2.3 From b4118988fdcb4554ea6687dd8ff68bcab690b8ea Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:42 +0200 Subject: sched/deadline: Make DL capacity-aware The current SCHED_DEADLINE (DL) scheduler uses a global EDF scheduling algorithm w/o considering CPU capacity or task utilization. This works well on homogeneous systems where DL tasks are guaranteed to have a bounded tardiness but presents issues on heterogeneous systems. A DL task can migrate to a CPU which does not have enough CPU capacity to correctly serve the task (e.g. a task w/ 70ms runtime and 100ms period on a CPU w/ 512 capacity). Add the DL fitness function dl_task_fits_capacity() for DL admission control on heterogeneous systems. A task fits onto a CPU if: CPU original capacity / 1024 >= task runtime / task deadline Use this function on heterogeneous systems to try to find a CPU which meets this criterion during task wakeup, push and offline migration. On homogeneous systems the original behavior of the DL admission control should be retained. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-5-dietmar.eggemann@arm.com --- kernel/sched/cpudeadline.c | 14 +++++++++++++- kernel/sched/deadline.c | 18 ++++++++++++++---- kernel/sched/sched.h | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..8630f2a40a3f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,7 +121,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - return 1; + int cpu; + + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. 
*/ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) + cpumask_clear_cpu(cpu, later_mask); + } + + if (!cpumask_empty(later_mask)) + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ebd0a9241ed..84e84ba0b00a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1643,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1662,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91b250f265c0..336887607b3d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -317,6 +317,21 @@ static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; } +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; +} + extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); -- cgit v1.2.3 From 23e71d8ba42933bff12e453858fd68c073bc5258 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:43 +0200 Subject: sched/deadline: Implement fallback mechanism for !fit case When a task has a runtime that cannot be served within the scheduling deadline by any of the idle CPU (later_mask) the task is doomed to miss its deadline. This can happen since the SCHED_DEADLINE admission control guarantees only bounded tardiness and not the hard respect of all deadlines. In this case try to select the idle CPU with the largest CPU capacity to minimize tardiness. Favor task_cpu(p) if it has max capacity of !fitting CPUs so that find_later_rq() can potentially still return it (most likely cache-hot) early. 
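For example, if the only idle CPUs are LITTLE CPUs of capacity 512, a task with 70 ms runtime per 100 ms period (bandwidth 0.7) fits none of them; the fallback then places it on the idle CPU with the largest original capacity among the non-fitting ones, preferring task_cpu(p) when it ties for that capacity so that find_later_rq() can still return the likely cache-hot current CPU.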
Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-6-dietmar.eggemann@arm.com --- kernel/sched/cpudeadline.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8630f2a40a3f..8cb06c8c7eb1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,19 +121,31 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - int cpu; + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; if (!static_branch_unlikely(&sched_asym_cpucapacity)) return 1; /* Ensure the capacity of the CPUs fits the task. */ for_each_cpu(cpu, later_mask) { - if (!dl_task_fits_capacity(p, cpu)) + if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } } - if (!cpumask_empty(later_mask)) - return 1; + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + + return 1; } else { int best_cpu = cpudl_maximum(cp); -- cgit v1.2.3 From c49694173da004b1b16082f82f28bd625415fbb2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 2 Jun 2020 21:50:02 +0200 Subject: sched/deadline: Fix a typo in a comment s/deadine/deadline/ Signed-off-by: Christophe JAILLET Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200602195002.677448-1-christophe.jaillet@wanadoo.fr --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 84e84ba0b00a..d4708e29008f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1137,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. 
As in this case * deadline < period, CBS enables a task to run for more than the -- cgit v1.2.3 From 87e867b4269f29dac8190bca13912d08163a277f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 12 Jun 2020 17:47:03 +0200 Subject: sched/pelt: Cleanup PELT divider Factorize in a single place the calculation of the divider to be used to to compute *_avg from *_sum value Suggested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200612154703.23555-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 32 ++++++++++++++++++-------------- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 5 +++++ 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 295c9ffa850b..0424a0af5f87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); @@ -3440,16 +3440,18 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * divider; @@ -3463,16 +3465,18 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; se->avg.runnable_sum = se->avg.runnable_avg * divider; @@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 
* See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index dea5567e4f72..11bea3b08115 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -262,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..795e43e02afc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return LOAD_AVG_MAX - 1024 + avg->period_contrib; +} + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. -- cgit v1.2.3 From 7318d4cc14c8c8a5dde2b0b72ea50fd2545f0b7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched: Provide sched_set_fifo() SCHED_FIFO (or any static priority scheduler) is a broken scheduler model; it is fundamentally incapable of resource management, the one thing an OS is actually supposed to do. It is impossible to compose static priority workloads. One cannot take two well designed and functional static priority workloads and mash them together and still expect them to work. Therefore it doesn't make sense to expose the priority field; the kernel is fundamentally incapable of setting a sensible value, it needs systems knowledge that it doesn't have. Take away sched_setschedule() / sched_setattr() from modules and replace them with: - sched_set_fifo(p); create a FIFO task (at prio 50) - sched_set_fifo_low(p); create a task higher than NORMAL, which ends up being a FIFO task at prio 1. - sched_set_normal(p, nice); (re)set the task to normal This stops the proliferation of randomly chosen, and irrelevant, FIFO priorities that dont't really mean anything anyway. The system administrator/integrator, whoever has insight into the actual system design and requirements (userspace) can set-up appropriate priorities if and when needed. Cc: airlied@redhat.com Cc: alexander.deucher@amd.com Cc: awalls@md.metrocast.net Cc: axboe@kernel.dk Cc: broonie@kernel.org Cc: daniel.lezcano@linaro.org Cc: gregkh@linuxfoundation.org Cc: hannes@cmpxchg.org Cc: herbert@gondor.apana.org.au Cc: hverkuil@xs4all.nl Cc: john.stultz@linaro.org Cc: nico@fluxnic.net Cc: paulmck@kernel.org Cc: rafael.j.wysocki@intel.com Cc: rmk+kernel@arm.linux.org.uk Cc: sudeep.holla@arm.com Cc: tglx@linutronix.de Cc: ulf.hansson@linaro.org Cc: wim@linux-watchdog.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Paul E. McKenney --- kernel/sched/core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0208b71bef80..40d3939b0520 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5124,6 +5124,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. 
An error code otherwise. * * NOTE that the task may be already dead. @@ -5166,6 +5168,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, } EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +int sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +int sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +int sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + return sched_setattr_nocheck(p, &attr); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { -- cgit v1.2.3 From 7a40798c714ff462863352d490b382515daba49e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,irq: Convert to sched_set_fifo() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Cc: tglx@linutronix.de Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- kernel/irq/manage.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..06b62746e1df 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1271,9 +1271,6 @@ static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; - struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -1281,13 +1278,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority -= 1; } if (IS_ERR(t)) return PTR_ERR(t); - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + sched_set_fifo(t); /* * We keep the reference to the task struct even if -- cgit v1.2.3 From 93db9129fa4beb78e2227554d237e8db04ca514c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,locktorture: Convert to sched_set_fifo() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively changes prio from 99 to 50. 
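In other words, the boost path changes roughly as follows (the actual hunk is below):

	/* Before: open-coded priority, MAX_RT_PRIO - 1 == 99. */
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);

	/* After: the helper picks the default FIFO priority, MAX_RT_PRIO / 2 == 50. */
	sched_set_fifo(current);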
Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/locking/locktorture.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..fca41a7ded0c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) static void torture_rtmutex_boost(struct torture_random_state *trsp) { - int policy; - struct sched_param param; const unsigned int factor = 50000; /* yes, quite arbitrary */ if (!rt_task(current)) { @@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (trsp && !(torture_random(trsp) % (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; + sched_set_fifo(current); } else /* common case, do nothing */ return; } else { @@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (!trsp || !(torture_random(trsp) % (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; + sched_set_normal(current, 0); } else /* common case, do nothing */ return; } - - sched_setscheduler_nocheck(current, policy, ¶m); } static void torture_rtmutex_delay(struct torture_random_state *trsp) -- cgit v1.2.3 From b1433395c4cc078b4382429e6d7dd1721714094f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,rcuperf: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..64249a045232 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -354,7 +354,6 @@ rcu_perf_writer(void *arg) int i_max; long me = (long)arg; struct rcu_head *rhp = NULL; - struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -363,8 +362,7 @@ rcu_perf_writer(void *arg) VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_set_fifo_low(current); if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); @@ -420,9 +418,7 @@ retry: started = true; if (!done && i >= MIN_MEAS) { done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); + sched_set_normal(current, 0); pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= -- cgit v1.2.3 From ce1c8afd3f3119ddaddcdff27579f5723f55c75e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,rcutorture: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. 
Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..bbc3c8a9e7b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -889,13 +889,11 @@ static int rcu_torture_boost(void *arg) unsigned long endtime; unsigned long oldstarttime; struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + if (sched_set_fifo_low(current) < 0) { VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } -- cgit v1.2.3 From 2cca5426b9c108998bc03230cd6ae440f3e205ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,psi: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner --- kernel/sched/psi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e53b711bd643..967732c0766c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -616,11 +616,8 @@ out: static int psi_poll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; - struct sched_param param = { - .sched_priority = 1, - }; - sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); while (true) { wait_event_interruptible(group->poll_wait, -- cgit v1.2.3 From 616d91b68cd56bcb1954b6a5af7d542401fde772 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched: Remove sched_setscheduler*() EXPORTs Now that nothing (modular) still uses sched_setscheduler(), remove the exports. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 40d3939b0520..f882d3d74dad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5135,13 +5135,11 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -5166,7 +5164,6 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -- cgit v1.2.3 From 8b700983de82f79e05b2c1136d6513ea4c9b22c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 13:10:04 +0200 Subject: sched: Remove sched_set_*() return value Ingo suggested that since the new sched_set_*() functions are implemented using the 'nocheck' variants, they really shouldn't ever fail, so remove the return value. 
Cc: axboe@kernel.dk Cc: daniel.lezcano@linaro.org Cc: sudeep.holla@arm.com Cc: airlied@redhat.com Cc: broonie@kernel.org Cc: paulmck@kernel.org Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- kernel/rcu/rcutorture.c | 5 +---- kernel/sched/core.c | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bbc3c8a9e7b9..b4c1146de414 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -893,10 +893,7 @@ static int rcu_torture_boost(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - if (sched_set_fifo_low(current) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f882d3d74dad..41d3778ea80e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5183,30 +5183,30 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, * The administrator _MUST_ configure the system, the kernel simply doesn't * know enough information to make a sensible choice. */ -int sched_set_fifo(struct task_struct *p) +void sched_set_fifo(struct task_struct *p) { struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; - return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); } EXPORT_SYMBOL_GPL(sched_set_fifo); /* * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. */ -int sched_set_fifo_low(struct task_struct *p) +void sched_set_fifo_low(struct task_struct *p) { struct sched_param sp = { .sched_priority = 1 }; - return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); } EXPORT_SYMBOL_GPL(sched_set_fifo_low); -int sched_set_normal(struct task_struct *p, int nice) +void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { .sched_policy = SCHED_NORMAL, .sched_nice = nice, }; - return sched_setattr_nocheck(p, &attr); + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); } EXPORT_SYMBOL_GPL(sched_set_normal); -- cgit v1.2.3 From 7fac96f2be3bbdb0b21c0de6812e965868137ad4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 May 2020 09:35:11 -0500 Subject: tracing/probe: Replace zero-length array with flexible-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://github.com/KSPP/linux/issues/21 Signed-off-by: Gustavo A. R. 
Silva --- kernel/trace/trace_probe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a0ff9e200ef6..a22b62813f8c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -236,7 +236,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; - struct trace_uprobe_filter filter[0]; + struct trace_uprobe_filter filter[]; }; struct trace_probe { -- cgit v1.2.3 From 60997c3d45d9a67daf01c56d805ae4fec37e0bd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 3 Jun 2020 21:48:55 +0200 Subject: close_range: add CLOSE_RANGE_UNSHARE One of the use-cases of close_range() is to drop file descriptors just before execve(). This would usually be expressed in the sequence: unshare(CLONE_FILES); close_range(3, ~0U); as pointed out by Linus it might be desirable to have this be a part of close_range() itself under a new flag CLOSE_RANGE_UNSHARE. This expands {dup,unshare)_fd() to take a max_fds argument that indicates the maximum number of file descriptors to copy from the old struct files. When the user requests that all file descriptors are supposed to be closed via close_range(min, max) then we can cap via unshare_fd(min) and hence don't need to do any of the heavy fput() work for everything above min. The patch makes it so that if CLOSE_RANGE_UNSHARE is requested and we do in fact currently share our file descriptor table we create a new private copy. We then close all fds in the requested range and finally after we're done we install the new fd table. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- kernel/fork.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..8948121c8454 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1474,7 +1474,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -2907,14 +2907,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2974,7 +2975,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3063,7 +3064,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; -- cgit v1.2.3 From e571d4ee334719727f22cce30c4c74471d4ef68a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Jun 2020 00:33:12 +0200 Subject: nsproxy: restore EINVAL for non-namespace file descriptor The LTP testsuite 
reported a regression where users would now see EBADF returned instead of EINVAL when an fd was passed that referred to an open file but the file was not a nsfd. Fix this by continuing to report EINVAL. Reported-by: kernel test robot Cc: Jan Stancek Cc: Cyril Hrubis Link: https://lore.kernel.org/lkml/20200615085836.GR12456@shao2-debian Fixes: 303cc571d107 ("nsproxy: attach to namespaces via pidfds") Signed-off-by: Christian Brauner --- kernel/nsproxy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b03df67621d0..cd356630a311 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -531,7 +531,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) } else if (!IS_ERR(pidfd_pid(file))) { err = check_setns_flags(flags); } else { - err = -EBADF; + err = -EINVAL; } if (err) goto out; -- cgit v1.2.3 From 6743ad432ec92e680cd0d9db86cb17b949cf5a43 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 12 May 2020 17:02:33 +0900 Subject: kprobes: Suppress the suspicious RCU warning on kprobes Anders reported that the lockdep warns that suspicious RCU list usage in register_kprobe() (detected by CONFIG_PROVE_RCU_LIST.) This is because get_kprobe() access kprobe_table[] by hlist_for_each_entry_rcu() without rcu_read_lock. If we call get_kprobe() from the breakpoint handler context, it is run with preempt disabled, so this is not a problem. But in other cases, instead of rcu_read_lock(), we locks kprobe_mutex so that the kprobe_table[] is not updated. So, current code is safe, but still not good from the view point of RCU. Joel suggested that we can silent that warning by passing lockdep_is_held() to the last argument of hlist_for_each_entry_rcu(). Add lockdep_is_held(&kprobe_mutex) at the end of the hlist_for_each_entry_rcu() to suppress the warning. Link: http://lkml.kernel.org/r/158927055350.27680.10261450713467997503.stgit@devnote2 Reported-by: Anders Roxell Suggested-by: Joel Fernandes Reviewed-by: Joel Fernandes (Google) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 50cd84f53df0..8b2fd4145ab3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -326,7 +326,8 @@ struct kprobe *get_kprobe(void *addr) struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist, + lockdep_is_held(&kprobe_mutex)) { if (p->addr == addr) return p; } -- cgit v1.2.3 From 7e6a71d8e60187726e29b13d9e9b23b77026c17a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 12 May 2020 17:02:44 +0900 Subject: kprobes: Use non RCU traversal APIs on kprobe_tables if possible Current kprobes uses RCU traversal APIs on kprobe_tables even if it is safe because kprobe_mutex is locked. Make those traversals to non-RCU APIs where the kprobe_mutex is locked. 
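Put differently, the rule of thumb these two kprobes patches follow is roughly (sketch of the hunks below):

	/* Writer paths hold kprobe_mutex, so plain traversal is sufficient: */
	lockdep_assert_held(&kprobe_mutex);
	hlist_for_each_entry(p, head, hlist)
		if (!kprobe_disabled(p))
			optimize_kprobe(p);

	/*
	 * get_kprobe() is also reached from the breakpoint handler with only
	 * preemption disabled, so it keeps the RCU traversal and passes a
	 * lockdep expression to silence CONFIG_PROVE_RCU_LIST for the
	 * kprobe_mutex-protected callers:
	 */
	hlist_for_each_entry_rcu(p, head, hlist,
				 lockdep_is_held(&kprobe_mutex))
		if (p->addr == addr)
			return p;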
Link: http://lkml.kernel.org/r/158927056452.27680.9710575332163005121.stgit@devnote2 Reviewed-by: Joel Fernandes (Google) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8b2fd4145ab3..ceb0e273bd69 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -46,6 +46,11 @@ static int kprobes_initialized; +/* kprobe_table can be accessed by + * - Normal hlist traversal and RCU add/del under kprobe_mutex is held. + * Or + * - RCU hlist traversal under disabling preempt (breakpoint handlers) + */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -850,7 +855,7 @@ static void optimize_all_kprobes(void) kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) + hlist_for_each_entry(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } @@ -877,7 +882,7 @@ static void unoptimize_all_kprobes(void) kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } @@ -1500,12 +1505,14 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; + lockdep_assert_held(&kprobe_mutex); + ap = get_kprobe(p->addr); if (unlikely(!ap)) return NULL; if (p != ap) { - list_for_each_entry_rcu(list_p, &ap->list, list) + list_for_each_entry(list_p, &ap->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid; @@ -1670,7 +1677,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; - list_for_each_entry_rcu(kp, &ap->list, list) + lockdep_assert_held(&kprobe_mutex); + + list_for_each_entry(kp, &ap->list, list) if (!kprobe_disabled(kp)) /* * There is an active probe on the list. @@ -1749,7 +1758,7 @@ static int __unregister_kprobe_top(struct kprobe *p) else { /* If disabling probe has special handlers, update aggrprobe */ if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry_rcu(list_p, &ap->list, list) { + list_for_each_entry(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; } @@ -2063,13 +2072,15 @@ static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; + lockdep_assert_held(&kprobe_mutex); + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. 
*/ - list_for_each_entry_rcu(kp, &p->list, list) + list_for_each_entry(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; kill_optimized_kprobe(p); @@ -2313,7 +2324,7 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) + hlist_for_each_entry(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2551,7 +2562,7 @@ static int arm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Arm all kprobes on a best-effort basis */ - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) { err = arm_kprobe(p); if (err) { @@ -2594,7 +2605,7 @@ static int disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Disarm all kprobes on a best-effort basis */ - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { err = disarm_kprobe(p, false); if (err) { -- cgit v1.2.3 From 1a0aa991a6274161c95a844c58cfb801d681eb59 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 12 May 2020 17:02:56 +0900 Subject: kprobes: Fix to protect kick_kprobe_optimizer() by kprobe_mutex In kprobe_optimizer() kick_kprobe_optimizer() is called without kprobe_mutex, but this can race with other caller which is protected by kprobe_mutex. To fix that, expand kprobe_mutex protected area to protect kick_kprobe_optimizer() call. Link: http://lkml.kernel.org/r/158927057586.27680.5036330063955940456.stgit@devnote2 Fixes: cd7ebe2298ff ("kprobes: Use text_poke_smp_batch for optimizing") Cc: Ingo Molnar Cc: "Gustavo A . R . Silva" Cc: Anders Roxell Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Ziqian SUN Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceb0e273bd69..0e185763578b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -592,11 +592,12 @@ static void kprobe_optimizer(struct work_struct *work) mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); - mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); + + mutex_unlock(&kprobe_mutex); } /* Wait for completing optimization and unoptimization */ -- cgit v1.2.3 From 75ddf64dd276e3fc8906f27549afa229798ad916 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 12 May 2020 17:03:07 +0900 Subject: kprobes: Remove redundant arch_disarm_kprobe() call Fix to remove redundant arch_disarm_kprobe() call in force_unoptimize_kprobe(). This arch_disarm_kprobe() will be invoked if the kprobe is optimized but disabled, but that means the kprobe (optprobe) is unused (and unoptimized) state. In that case, unoptimize_kprobe() puts it in freeing_list and kprobe_optimizer (do_unoptimize_kprobes()) automatically disarm it. Thus this arch_disarm_kprobe() is redundant. 
Link: http://lkml.kernel.org/r/158927058719.27680.17183632908465341189.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0e185763578b..5cb7791c16b3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -675,8 +675,6 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op) lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - if (kprobe_disabled(&op->kp)) - arch_disarm_kprobe(&op->kp); } /* Unoptimize a kprobe if p is optimized */ -- cgit v1.2.3 From 9b38cc704e844e41d9cf74e647bff1d249512cb3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 May 2020 17:03:18 +0900 Subject: kretprobe: Prevent triggering kretprobe from within kprobe_flush_task Ziqian reported lockup when adding retprobe on _raw_spin_lock_irqsave. My test was also able to trigger lockdep output: ============================================ WARNING: possible recursive locking detected 5.6.0-rc6+ #6 Not tainted -------------------------------------------- sched-messaging/2767 is trying to acquire lock: ffffffff9a492798 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_hash_lock+0x52/0xa0 but task is already holding lock: ffffffff9a491a18 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_trampoline+0x0/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(kretprobe_table_locks[i].lock)); lock(&(kretprobe_table_locks[i].lock)); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by sched-messaging/2767: #0: ffffffff9a491a18 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_trampoline+0x0/0x50 stack backtrace: CPU: 3 PID: 2767 Comm: sched-messaging Not tainted 5.6.0-rc6+ #6 Call Trace: dump_stack+0x96/0xe0 __lock_acquire.cold.57+0x173/0x2b7 ? native_queued_spin_lock_slowpath+0x42b/0x9e0 ? lockdep_hardirqs_on+0x590/0x590 ? __lock_acquire+0xf63/0x4030 lock_acquire+0x15a/0x3d0 ? kretprobe_hash_lock+0x52/0xa0 _raw_spin_lock_irqsave+0x36/0x70 ? kretprobe_hash_lock+0x52/0xa0 kretprobe_hash_lock+0x52/0xa0 trampoline_handler+0xf8/0x940 ? kprobe_fault_handler+0x380/0x380 ? find_held_lock+0x3a/0x1c0 kretprobe_trampoline+0x25/0x50 ? lock_acquired+0x392/0xbc0 ? _raw_spin_lock_irqsave+0x50/0x70 ? __get_valid_kprobe+0x1f0/0x1f0 ? _raw_spin_unlock_irqrestore+0x3b/0x40 ? finish_task_switch+0x4b9/0x6d0 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 The code within the kretprobe handler checks for probe reentrancy, so we won't trigger any _raw_spin_lock_irqsave probe in there. The problem is in outside kprobe_flush_task, where we call: kprobe_flush_task kretprobe_table_lock raw_spin_lock_irqsave _raw_spin_lock_irqsave where _raw_spin_lock_irqsave triggers the kretprobe and installs kretprobe_trampoline handler on _raw_spin_lock_irqsave return. The kretprobe_trampoline handler is then executed with already locked kretprobe_table_locks, and first thing it does is to lock kretprobe_table_locks ;-) the whole lockup path like: kprobe_flush_task kretprobe_table_lock raw_spin_lock_irqsave _raw_spin_lock_irqsave ---> probe triggered, kretprobe_trampoline installed ---> kretprobe_table_locks locked kretprobe_trampoline trampoline_handler kretprobe_hash_lock(current, &head, &flags); <--- deadlock Adding kprobe_busy_begin/end helpers that mark code with fake probe installed to prevent triggering of another kprobe within this code. 
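In sketch form, the protection added to kprobe_flush_task() is roughly:

	kprobe_busy_begin();	/* install a dummy current_kprobe */

	/*
	 * Recycle the dead task's kretprobe instances here. Any kprobe that
	 * would fire in this region (e.g. on _raw_spin_lock_irqsave) now sees
	 * current_kprobe set, so the reentrancy check makes it back off
	 * instead of running the handler that used to deadlock on
	 * kretprobe_table_locks.
	 */

	kprobe_busy_end();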
Using these helpers in kprobe_flush_task, so the probe recursion protection check is hit and the probe is never set to prevent above lockup. Link: http://lkml.kernel.org/r/158927059835.27680.7011202830041561604.stgit@devnote2 Fixes: ef53d9c5e4da ("kprobes: improve kretprobe scalability with hashed locking") Cc: Ingo Molnar Cc: "Gustavo A . R . Silva" Cc: Anders Roxell Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: stable@vger.kernel.org Reported-by: "Ziqian SUN (Zamir)" Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5cb7791c16b3..4a904cc56d68 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1241,6 +1241,26 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); +struct kprobe kprobe_busy = { + .addr = (void *) get_kprobe, +}; + +void kprobe_busy_begin(void) +{ + struct kprobe_ctlblk *kcb; + + preempt_disable(); + __this_cpu_write(current_kprobe, &kprobe_busy); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; +} + +void kprobe_busy_end(void) +{ + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); +} + /* * This function is called from finish_task_switch when task tk becomes dead, * so that we can recycle any function-return probe instances associated @@ -1258,6 +1278,8 @@ void kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + kprobe_busy_begin(); + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; @@ -1271,6 +1293,8 @@ void kprobe_flush_task(struct task_struct *tk) hlist_del(&ri->hlist); kfree(ri); } + + kprobe_busy_end(); } NOKPROBE_SYMBOL(kprobe_flush_task); -- cgit v1.2.3 From 4649079b9de1ad86be9f4c989373adb8235a8485 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 9 Jun 2020 22:00:41 -0400 Subject: tracing: Make ftrace packed events have align of 1 When using trace-cmd on 5.6-rt for the function graph tracer, the output was corrupted. It gave output like this: funcgraph_entry: func=0xffffffff depth=38982 funcgraph_entry: func=0x1ffffffff depth=16044 funcgraph_exit: func=0xffffffff overrun=0x92539aaf00000000 calltime=0x92539c9900000072 rettime=0x100000072 depth=11084 funcgraph_exit: func=0xffffffff overrun=0x9253946e00000000 calltime=0x92539e2100000072 rettime=0x72 depth=26033702 funcgraph_entry: func=0xffffffff depth=85798 funcgraph_entry: func=0x1ffffffff depth=12044 The reason was because the tracefs/events/ftrace/funcgraph_entry/exit format file was incorrect. The -rt kernel adds more common fields to the trace events. Namely, common_migrate_disable and common_preempt_lazy_count. Each is one byte in size. This changes the alignment of the normal payload. Most events are aligned normally, but the function and function graph events are defined with a "PACKED" macro, that packs their payload. As the offsets displayed in the format files are now calculated by an aligned field, the aligned field for function and function graph events should be 1, not their normal alignment. 
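As a generic illustration (ordinary userspace C, not kernel code): packing a structure removes inter-member padding and drops its alignment to 1, which is why the reported field alignment for packed payloads must also be 1:

	#include <stdio.h>

	struct plain_payload { unsigned long func; int depth; };
	struct __attribute__((packed)) packed_payload { unsigned long func; int depth; };

	int main(void)
	{
		/* Typically prints align 8 vs align 1 on x86-64. */
		printf("plain:  align %zu, size %zu\n",
		       _Alignof(struct plain_payload), sizeof(struct plain_payload));
		printf("packed: align %zu, size %zu\n",
		       _Alignof(struct packed_payload), sizeof(struct packed_payload));
		return 0;
	}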
With aligning of the funcgraph_entry event, the format file has: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned char common_migrate_disable; offset:8; size:1; signed:0; field:unsigned char common_preempt_lazy_count; offset:9; size:1; signed:0; field:unsigned long func; offset:16; size:8; signed:0; field:int depth; offset:24; size:4; signed:1; But the actual alignment is: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned char common_migrate_disable; offset:8; size:1; signed:0; field:unsigned char common_preempt_lazy_count; offset:9; size:1; signed:0; field:unsigned long func; offset:12; size:8; signed:0; field:int depth; offset:20; size:4; signed:1; Link: https://lkml.kernel.org/r/20200609220041.2a3b527f@oasis.local.home Cc: stable@vger.kernel.org Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_entries.h | 14 +++++++------- kernel/trace/trace_export.c | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index def769df5bf1..13db4000af3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -61,6 +61,9 @@ enum trace_type { #undef __field_desc #define __field_desc(type, container, item) +#undef __field_packed +#define __field_packed(type, container, item) + #undef __array #define __array(type, item, size) type item[size]; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index a523da0dae0a..18c4a58aff79 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -78,8 +78,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) - __field_desc( unsigned long, graph_ent, func ) - __field_desc( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, func ) + __field_packed( int, graph_ent, depth ) ), F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) @@ -92,11 +92,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_desc( unsigned long, ret, func ) - __field_desc( unsigned long, ret, overrun ) - __field_desc( unsigned long long, ret, calltime) - __field_desc( unsigned long long, ret, rettime ) - __field_desc( int, ret, depth ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77ce5a3b6773..70d3d0a09053 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -45,6 +45,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field_desc #define __field_desc(type, container, item) type item; +#undef __field_packed +#define __field_packed(type, container, item) type item; + #undef __array #define 
__array(type, item, size) type item[size]; @@ -85,6 +88,13 @@ static void __always_unused ____ftrace_check_##name(void) \ .size = sizeof(_type), .align = __alignof__(_type), \ is_signed_type(_type), .filter_type = _filter_type }, + +#undef __field_ext_packed +#define __field_ext_packed(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = 1, \ + is_signed_type(_type), .filter_type = _filter_type }, + #undef __field #define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) @@ -94,6 +104,9 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __field_desc #define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) +#undef __field_packed +#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) + #undef __array #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ @@ -129,6 +142,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __field_desc #define __field_desc(type, container, item) +#undef __field_packed +#define __field_packed(type, container, item) + #undef __array #define __array(type, item, len) -- cgit v1.2.3 From 48a42f5d138435242529726b8802076a24b6db17 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 10 Jun 2020 11:32:51 +0800 Subject: trace: Fix typo in allocate_ftrace_ops()'s comment No functional change, just correct the word. Link: https://lkml.kernel.org/r/20200610033251.31713-1-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8a4c8d5c2c98..dd4dff71d89a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -42,7 +42,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) if (!ops) return -ENOMEM; - /* Currently only the non stack verision is supported */ + /* Currently only the non stack version is supported */ ops->func = function_trace_call; ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; -- cgit v1.2.3 From 3aa8fdc37d16735e8891035becf25b3857d3efe0 Mon Sep 17 00:00:00 2001 From: Vamshi K Sthambamkadi Date: Mon, 15 Jun 2020 20:00:38 +0530 Subject: tracing/probe: Fix memleak in fetch_op_data operations kmemleak report: [<57dcc2ca>] __kmalloc_track_caller+0x139/0x2b0 [] kstrndup+0x37/0x80 [] parse_probe_arg.isra.7+0x3cc/0x630 [<055bf2ba>] traceprobe_parse_probe_arg+0x2f5/0x810 [<655a7766>] trace_kprobe_create+0x2ca/0x950 [<4fc6a02a>] create_or_delete_trace_kprobe+0xf/0x30 [<6d1c8a52>] trace_run_command+0x67/0x80 [] trace_parse_run_command+0xa7/0x140 [] probes_write+0x10/0x20 [<2027641c>] __vfs_write+0x30/0x1e0 [<6a4aeee1>] vfs_write+0x96/0x1b0 [<3517fb7d>] ksys_write+0x53/0xc0 [] __ia32_sys_write+0x15/0x20 [] do_syscall_32_irqs_on+0x3d/0x260 [] do_fast_syscall_32+0x39/0xb0 [] entry_SYSENTER_32+0xaf/0x102 Post parse_probe_arg(), the FETCH_OP_DATA operation type is overwritten to FETCH_OP_ST_STRING, as a result memory is never freed since traceprobe_free_probe_arg() iterates only over SYMBOL and DATA op types Setup fetch string operation correctly after fetch_op_data operation. 
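To see why the rewritten op type leaks the duplicated string, here is a simplified sketch (not the literal kernel code) of the cleanup path the message above refers to; it only frees entries still tagged as SYMBOL or DATA:

```
/* Simplified sketch of traceprobe_free_probe_arg() (illustrative). */
static void traceprobe_free_probe_arg(struct probe_arg *arg)
{
	struct fetch_insn *code = arg->code;

	while (code && code->op != FETCH_OP_END) {
		if (code->op == FETCH_NOP_SYMBOL ||
		    code->op == FETCH_OP_DATA)
			kfree(code->data);
		/* An entry rewritten to FETCH_OP_ST_STRING is skipped
		 * here, so its code->data is never freed. */
		code++;
	}
	kfree(arg->code);
}
```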
Link: https://lkml.kernel.org/r/20200615143034.GA1734@cosmos Cc: stable@vger.kernel.org Fixes: a42e3c4de964 ("tracing/probe: Add immediate string parameter support") Acked-by: Masami Hiramatsu Signed-off-by: Vamshi K Sthambamkadi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b8a928e925c7..d2867ccc6aca 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -639,8 +639,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, ret = -EINVAL; goto fail; } - if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || - parg->count) { + if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) || parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an -- cgit v1.2.3 From 69243720c0932b8672e571a873c78bcf3326575a Mon Sep 17 00:00:00 2001 From: YangHui Date: Tue, 16 Jun 2020 11:36:46 +0800 Subject: tracing: Remove unused event variable in tracing_iter_reset We do not use the event variable, just remove it. Signed-off-by: YangHui Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ec44b0e2a19c..bb62269724d5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3570,7 +3570,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; @@ -3588,7 +3587,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ - while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; -- cgit v1.2.3 From 1fbf57d0530217b9f264eedc4867bf91479cdf3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 10:29:49 +0200 Subject: dma-direct: re-enable mmap for !CONFIG_MMU nommu configfs can trivially map the coherent allocations to user space, as no actual page table setup is required and the kernel and the user space programs share the same address space. 
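Not part of this change, but for context: drivers reach dma_direct_mmap() through dma_mmap_coherent()/dma_mmap_attrs(). A minimal sketch of a driver .mmap hook that benefits on !MMU (struct mydev and its fields are made up for the example):

```
#include <linux/dma-mapping.h>
#include <linux/fs.h>
#include <linux/mm.h>

struct mydev {				/* hypothetical driver state */
	struct device *dev;
	void *cpu_addr;
	dma_addr_t dma_handle;
	size_t size;
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydev *md = file->private_data;

	/* On !MMU this again reaches dma_direct_mmap() instead of
	 * failing with -ENXIO. */
	return dma_mmap_coherent(md->dev, vma, md->cpu_addr,
				 md->dma_handle, md->size);
}
```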
Fixes: 62fcee9a3bd7 ("dma-mapping: remove CONFIG_ARCH_NO_COHERENT_DMA_MMAP") Signed-off-by: Christoph Hellwig Reported-by: dillon min Reviewed-by: Vladimir Murzin Tested-by: dillon min --- kernel/dma/Kconfig | 1 + kernel/dma/direct.c | 14 -------------- 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a0ce3c1494fd..14ef8e1bdefe 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -71,6 +71,7 @@ config SWIOTLB # in the pagetables # config DMA_NONCOHERENT_MMAP + default y if !MMU bool config DMA_COHERENT_POOL diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a4881e59aa7..9ec6a5c3fc57 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -459,7 +459,6 @@ int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } -#ifdef CONFIG_MMU bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || @@ -485,19 +484,6 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } -#else /* CONFIG_MMU */ -bool dma_direct_can_mmap(struct device *dev) -{ - return false; -} - -int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -ENXIO; -} -#endif /* CONFIG_MMU */ int dma_direct_supported(struct device *dev, u64 mask) { -- cgit v1.2.3 From 26749b3201ab05e288fbf78fbc8585dfa2da3218 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Jun 2020 08:52:31 +0200 Subject: dma-direct: mark __dma_direct_alloc_pages static Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9ec6a5c3fc57..30c41b57acd9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -109,7 +109,7 @@ static inline bool dma_should_free_from_pool(struct device *dev, return false; } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); -- cgit v1.2.3 From 633d5fce78a61e8727674467944939f55b0bcfab Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:28 -0700 Subject: dma-direct: always align allocation size in dma_direct_alloc_pages() dma_alloc_contiguous() does size >> PAGE_SHIFT and set_memory_decrypted() works at page granularity. It's necessary to page align the allocation size in dma_direct_alloc_pages() for consistent behavior. This also fixes an issue when arch_dma_prep_coherent() is called on an unaligned allocation size for dma_alloc_need_uncached() when CONFIG_DMA_DIRECT_REMAP is disabled but CONFIG_ARCH_HAS_DMA_SET_UNCACHED is enabled. 
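A tiny user-space arithmetic sketch of the inconsistency being fixed (the macros are redefined locally just to mirror the kernel's, assuming 4 KiB pages):

```
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long size = 300;	/* an unaligned request */

	/* 0 pages vs. one full page: callers disagree unless 'size'
	 * is aligned once, up front. */
	printf("size >> PAGE_SHIFT = %lu\n", size >> PAGE_SHIFT);  /* 0 */
	printf("PAGE_ALIGN(size)   = %lu\n", PAGE_ALIGN(size));    /* 4096 */
	return 0;
}
```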
Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 30c41b57acd9..c93e3c8e3d01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -112,11 +112,12 @@ static inline bool dma_should_free_from_pool(struct device *dev, static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { - size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; u64 phys_limit; + WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -124,14 +125,14 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - page = dma_alloc_contiguous(dev, alloc_size, gfp); + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, alloc_size); + dma_free_contiguous(dev, page, size); page = NULL; } again: if (!page) - page = alloc_pages_node(node, gfp, get_order(alloc_size)); + page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; @@ -158,8 +159,10 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + size = PAGE_ALIGN(size); + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, size, &page, gfp); if (!ret) return NULL; goto done; @@ -183,10 +186,10 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_alloc_need_uncached(dev, attrs)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) -- cgit v1.2.3 From 96a539fa3bb71f443ae08e57b9f63d6e5bb2207c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:29 -0700 Subject: dma-direct: re-encrypt memory if dma_direct_alloc_pages() fails If arch_dma_set_uncached() fails after memory has been decrypted, it needs to be re-encrypted before freeing. 
Fixes: fa7e2247c572 ("dma-direct: make uncached_kernel_address more general") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c93e3c8e3d01..80d33f215a2e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -220,7 +220,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) - goto out_free_pages; + goto out_encrypt_pages; } done: if (force_dma_unencrypted(dev)) @@ -228,6 +228,11 @@ done: else *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; + +out_encrypt_pages: + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); out_free_pages: dma_free_contiguous(dev, page, size); return NULL; -- cgit v1.2.3 From 56fccf21d1961a06e2a0c96ce446ebf036651062 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:30 -0700 Subject: dma-direct: check return value when encrypting or decrypting memory __change_page_attr() can fail which will cause set_memory_encrypted() and set_memory_decrypted() to return non-zero. If the device requires unencrypted DMA memory and decryption fails, simply free the memory and fail. If attempting to re-encrypt in the failure path and that encryption fails, there is no alternative other than to leak the memory. Fixes: c10f07aa27da ("dma/direct: Handle force decryption for DMA coherent buffers in common code") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 80d33f215a2e..2f69bfdbe315 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -158,6 +158,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, { struct page *page; void *ret; + int err; size = PAGE_ALIGN(size); @@ -210,8 +211,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) - set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); @@ -230,9 +235,13 @@ done: return ret; out_encrypt_pages: - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)page_address(page), - 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); + /* If memory cannot be re-encrypted, it must be leaked */ + if (err) + return NULL; + } out_free_pages: dma_free_contiguous(dev, page, size); return NULL; -- cgit v1.2.3 From 1b0b283648163dae2a214ca28ed5a99f62a77319 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 5 Jun 2020 16:58:36 +0200 Subject: blktrace: break out of blktrace setup on concurrent calls We use one blktrace per request_queue, that means one per the entire disk. So we cannot run one blktrace on say /dev/vda and then /dev/vda1, or just two calls on /dev/vda. We check for concurrent setup only at the very end of the blktrace setup though. If we try to run two concurrent blktraces on the same block device the second one will fail, and the first one seems to go on. 
However when one tries to kill the first one one will see things like this: The kernel will show these: ``` debugfs: File 'dropped' in directory 'nvme1n1' already present! debugfs: File 'msg' in directory 'nvme1n1' already present! debugfs: File 'trace0' in directory 'nvme1n1' already present! `` And userspace just sees this error message for the second call: ``` blktrace /dev/nvme1n1 BLKTRACESETUP(2) /dev/nvme1n1 failed: 5/Input/output error ``` The first userspace process #1 will also claim that the files were taken underneath their nose as well. The files are taken away form the first process given that when the second blktrace fails, it will follow up with a BLKTRACESTOP and BLKTRACETEARDOWN. This means that even if go-happy process #1 is waiting for blktrace data, we *have* been asked to take teardown the blktrace. This can easily be reproduced with break-blktrace [0] run_0005.sh test. Just break out early if we know we're already going to fail, this will prevent trying to create the files all over again, which we know still exist. [0] https://github.com/mcgrof/break-blktrace Signed-off-by: Luis Chamberlain Signed-off-by: Jan Kara Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5773f0ba7e76..8fd36e827f14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -3,6 +3,9 @@ * Copyright (C) 2006 Jens Axboe * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -494,6 +497,16 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, */ strreplace(buts->name, '/', '_'); + /* + * bdev can be NULL, as with scsi-generic, this is a helpful as + * we can be. + */ + if (q->blk_trace) { + pr_warn("Concurrent blktraces are not allowed on %s\n", + buts->name); + return -EBUSY; + } + bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; -- cgit v1.2.3 From c3dbe541ef77754729de5e82be2d6e5d267c6c8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:58:37 +0200 Subject: blktrace: Avoid sparse warnings when assigning q->blk_trace Mostly for historical reasons, q->blk_trace is assigned through xchg() and cmpxchg() atomic operations. Although this is correct, sparse complains about this because it violates rcu annotations since commit c780e86dd48e ("blktrace: Protect q->blk_trace with RCU") which started to use rcu for accessing q->blk_trace. Furthermore there's no real need for atomic operations anymore since all changes to q->blk_trace happen under q->blk_trace_mutex and since it also makes more sense to check if q->blk_trace is set with the mutex held earlier. So let's just replace xchg() with rcu_replace_pointer() and cmpxchg() with explicit check and rcu_assign_pointer(). This makes the code more efficient and sparse happy. 
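Shown generically (a sketch with made-up names, not the blktrace code itself), the resulting pattern for an __rcu pointer whose writers are serialized by a mutex:

```
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

/* Generic sketch: mutex-serialized updates of an __rcu pointer. */
struct foo;
static struct foo __rcu *active_foo;
static DEFINE_MUTEX(foo_mutex);

static int install_foo(struct foo *new)
{
	lockdep_assert_held(&foo_mutex);

	if (rcu_dereference_protected(active_foo,
				      lockdep_is_held(&foo_mutex)))
		return -EBUSY;		/* already set up */

	rcu_assign_pointer(active_foo, new);
	return 0;
}

static struct foo *remove_foo(void)
{
	lockdep_assert_held(&foo_mutex);

	return rcu_replace_pointer(active_foo, NULL,
				   lockdep_is_held(&foo_mutex));
}
```

No atomics are needed because the mutex already orders the writers; readers keep using rcu_dereference() as before.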
Reported-by: kbuild test robot Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8fd36e827f14..5ef0484513ec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -347,7 +347,8 @@ static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; - bt = xchg(&q->blk_trace, NULL); + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->blk_trace_mutex)); if (!bt) return -EINVAL; @@ -501,7 +502,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; @@ -556,10 +558,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - ret = -EBUSY; - if (cmpxchg(&q->blk_trace, NULL, bt)) - goto err; - + rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); ret = 0; @@ -1642,7 +1641,8 @@ static int blk_trace_remove_queue(struct request_queue *q) { struct blk_trace *bt; - bt = xchg(&q->blk_trace, NULL); + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -1674,10 +1674,7 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - ret = -EBUSY; - if (cmpxchg(&q->blk_trace, NULL, bt)) - goto free_bt; - + rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); return 0; -- cgit v1.2.3 From 02553b91da5deb63c8562b47529b09b734659af0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jun 2020 22:04:30 -0700 Subject: bpf: bpf_probe_read_kernel_str() has to return amount of data read on success During recent refactorings, bpf_probe_read_kernel_str() started returning 0 on success, instead of amount of data successfully read. This majorly breaks applications relying on bpf_probe_read_kernel_str() and bpf_probe_read_str() and their results. Fix this by returning actual number of bytes read. 
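A sketch of how callers rely on the restored return value (illustrative fragment, assuming a libbpf-style program built with bpf/bpf_helpers.h; 'kaddr' is assumed to point at a NUL-terminated kernel string):

```
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

static __always_inline long log_kernel_string(const void *kaddr)
{
	char buf[64];
	long n;

	n = bpf_probe_read_kernel_str(buf, sizeof(buf), kaddr);

	/* With this fix, n > 0 is the number of bytes copied including
	 * the trailing NUL; before it, success was reported as 0. */
	if (n > 0)
		bpf_printk("str=%s len=%ld", buf, n);
	return n;
}
```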
Fixes: 8d92db5c04d1 ("bpf: rework the compat kernel probe handling") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616050432.1902042-1-andriin@fb.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e729c9e587a0..a3ac7de98baa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -241,7 +241,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) if (unlikely(ret < 0)) goto fail; - return 0; + return ret; fail: memset(dst, 0, size); return ret; -- cgit v1.2.3 From 99c51064fb06146b3d494b745c947e438a10aaa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 16 Jun 2020 16:28:29 +0200 Subject: devmap: Use bpf_map_area_alloc() for allocating hash buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzkaller discovered that creating a hash of type devmap_hash with a large number of entries can hit the memory allocator limit for allocating contiguous memory regions. There's really no reason to use kmalloc_array() directly in the devmap code, so just switch it to the existing bpf_map_area_alloc() function that is used elsewhere. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Xiumei Mu Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616142829.114173-1-toke@redhat.com --- kernel/bpf/devmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0cbb72cdaf63..5fdbc776a760 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -86,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -145,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) goto free_charge; @@ -232,7 +234,7 @@ static void dev_map_free(struct bpf_map *map) } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; -- cgit v1.2.3 From d8fe449a9c51a37d844ab607e14e2f5c657d3cf2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 16 Jun 2020 18:04:14 -0700 Subject: bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE Attaching to these hooks can break iptables because its optval is usually quite big, or at least bigger than the current PAGE_SIZE limit. David also mentioned some SCTP options can be big (around 256k). 
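For illustration only (not part of the patch): a minimal cgroup sockopt program that copes with a truncated optval by using the ctx->optlen = 0 convention described in this change. The section name, license stub and the hard-coded 4096 page size are assumptions of the sketch:

```
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int ignore_huge_optvals(struct bpf_sockopt *ctx)
{
	/* Only the first page of optval is exposed; for anything larger,
	 * tell the kernel to use the original user-space buffer. */
	if (ctx->optlen > 4096) {
		ctx->optlen = 0;
		return 1;		/* allow the setsockopt() */
	}

	/* ... inspect or rewrite small optvals here ... */
	return 1;
}

char LICENSE[] SEC("license") = "GPL";
```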
For such optvals we expose only the first PAGE_SIZE bytes to the BPF program. BPF program has two options: 1. Set ctx->optlen to 0 to indicate that the BPF's optval should be ignored and the kernel should use original userspace value. 2. Set ctx->optlen to something that's smaller than the PAGE_SIZE. v5: * use ctx->optlen == 0 with trimmed buffer (Alexei Starovoitov) * update the docs accordingly v4: * use temporary buffer to avoid optval == optval_end == NULL; this removes the corner case in the verifier that might assume non-zero PTR_TO_PACKET/PTR_TO_PACKET_END. v3: * don't increase the limit, bypass the argument v2: * proper comments formatting (Jakub Kicinski) Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: David Laight Link: https://lore.kernel.org/bpf/20200617010416.93086-1-sdf@google.com --- kernel/bpf/cgroup.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4d76f16524cc..ac53102e244a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) { - if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + if (unlikely(max_optlen < 0)) return -EINVAL; + if (unlikely(max_optlen > PAGE_SIZE)) { + /* We don't expose optvals that are greater than PAGE_SIZE + * to the BPF program. + */ + max_optlen = PAGE_SIZE; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - return 0; + return max_optlen; } static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) @@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, /* export any potential modifications */ *level = ctx.level; *optname = ctx.optname; - *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + + /* optlen == 0 from BPF indicates that we should + * use original userspace data. 
+ */ + if (ctx.optlen != 0) { + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } } out: @@ -1385,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; - ctx.optlen = max_optlen; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back @@ -1404,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) - ctx.optlen = max_optlen; - - if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + if (copy_from_user(ctx.optval, optval, + min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1436,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; + if (ctx.optlen != 0) { + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } } ret = ctx.retval; -- cgit v1.2.3 From fe557319aa06c23cffc9346000f119547e0f289a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2020 09:37:53 +0200 Subject: maccess: rename probe_kernel_{read,write} to copy_{from,to}_kernel_nofault Better describe what these functions do. Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- kernel/debug/debug_core.c | 6 +++--- kernel/debug/gdbstub.c | 6 +++--- kernel/debug/kdb/kdb_main.c | 3 ++- kernel/debug/kdb/kdb_support.c | 7 ++++--- kernel/kthread.c | 2 +- kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/workqueue.c | 10 +++++----- 8 files changed, 22 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ccc0f98abdd4..bc8d25f2ac8a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -169,18 +169,18 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); if (err) return err; - err = probe_kernel_write((char *)bpt->bpt_addr, + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - return probe_kernel_write((char *)bpt->bpt_addr, + return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b280fc7dd67..61774aec46b4 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -247,7 +247,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count) */ tmp = buf + count; - err = probe_kernel_read(tmp, mem, count); + err = copy_from_kernel_nofault(tmp, mem, count); if (err) return NULL; while (count > 0) { @@ -283,7 +283,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count) *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } - return probe_kernel_write(mem, tmp_raw, count); + return copy_to_kernel_nofault(mem, tmp_raw, count); } /* @@ -335,7 +335,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) size++; } - return 
probe_kernel_write(mem, c, size); + return copy_to_kernel_nofault(mem, c, size); } #if DBG_MAX_REG_NUM > 0 diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ec190569f690..5c7949061671 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2326,7 +2326,8 @@ void kdb_ps1(const struct task_struct *p) int cpu; unsigned long tmp; - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + if (!p || + copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long))) return; cpu = kdb_process_cpu(p); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index b8e6306e7e13..004c5b6c87f8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -325,7 +325,7 @@ char *kdb_strdup(const char *str, gfp_t type) */ int kdb_getarea_size(void *res, unsigned long addr, size_t size) { - int ret = probe_kernel_read((char *)res, (char *)addr, size); + int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); @@ -350,7 +350,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size) */ int kdb_putarea_size(unsigned long addr, void *res, size_t size) { - int ret = probe_kernel_read((char *)addr, (char *)res, size); + int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); @@ -624,7 +624,8 @@ char kdb_task_state_char (const struct task_struct *p) char state; unsigned long tmp; - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + if (!p || + copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long))) return 'E'; cpu = kdb_process_cpu(p); diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e3d2d7fdf5e..132f84a5fde3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -201,7 +201,7 @@ void *kthread_probe_data(struct task_struct *task) struct kthread *kthread = to_kthread(task); void *data = NULL; - probe_kernel_read(&data, &kthread->data, sizeof(data)); + copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e729c9e587a0..204afc12425f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,7 +196,7 @@ bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) if (unlikely(ret < 0)) goto fail; - ret = probe_kernel_read(dst, unsafe_ptr, size); + ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) goto fail; return ret; @@ -661,7 +661,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, copy_size = (fmt[i + 2] == '4') ? 
4 : 16; - err = probe_kernel_read(bufs->buf[memcpy_cnt], + err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], (void *) (long) args[fmt_cnt], copy_size); if (err < 0) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6048f1be26d2..841c74863ff8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1222,7 +1222,7 @@ fetch_store_strlen(unsigned long addr) #endif do { - ret = probe_kernel_read(&c, (u8 *)addr + len, 1); + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); @@ -1300,7 +1300,7 @@ probe_mem_read(void *dest, void *src, size_t size) if ((unsigned long)src < TASK_SIZE) return probe_mem_read_user(dest, src, size); #endif - return probe_kernel_read(dest, src, size); + return copy_from_kernel_nofault(dest, src, size); } /* Note that we don't verify it, since the code does not come from user space */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9fbe1e237563..c41c3c17b86a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4638,11 +4638,11 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ - probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); - probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); - probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); - probe_kernel_read(name, wq->name, sizeof(name) - 1); - probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); + copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); + copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); + copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); + copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); -- cgit v1.2.3 From c0ee37e85e0e47402b8bbe35b6cec8e06937ca58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2020 09:37:54 +0200 Subject: maccess: rename probe_user_{read,write} to copy_{from,to}_user_nofault Better describe what these functions do. 
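As with the kernel-side rename above, a quick caller-level sketch of the new names (illustrative; the pointers and the wrapper are made up). Semantics are unchanged: 0 on success, a negative error instead of a handled page fault on a bad address:

```
#include <linux/uaccess.h>

static long probe_example(const void *kernel_ptr, void __user *user_ptr)
{
	unsigned long val;

	/* was: probe_kernel_read() */
	if (copy_from_kernel_nofault(&val, kernel_ptr, sizeof(val)))
		return -EFAULT;

	/* was: probe_user_write() */
	if (copy_to_user_nofault(user_ptr, &val, sizeof(val)))
		return -EFAULT;

	return 0;
}
```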
Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 204afc12425f..dc05626979b8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -141,7 +141,7 @@ bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; - ret = probe_user_read(dst, unsafe_ptr, size); + ret = copy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; @@ -326,7 +326,7 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, if (unlikely(!nmi_uaccess_okay())) return -EPERM; - return probe_user_write(unsafe_ptr, src, size); + return copy_to_user_nofault(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 841c74863ff8..aefb6065b508 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1290,7 +1290,7 @@ probe_mem_read_user(void *dest, void *src, size_t size) { const void __user *uaddr = (__force const void __user *)src; - return probe_user_read(dest, uaddr, size); + return copy_from_user_nofault(dest, uaddr, size); } static nokprobe_inline int -- cgit v1.2.3 From bbccc11bc8848926065915e6193fd4c6e33c85ef Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 24 May 2020 15:52:38 -0500 Subject: audit: Use struct_size() helper in alloc_chunk One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct audit_chunk { ... struct node { struct list_head list; struct audit_tree *owner; unsigned index; /* index; upper bit indicates 'will prune' */ } owners[]; }; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. So, replace the following form: offsetof(struct audit_chunk, owners) + count * sizeof(struct node); with: struct_size(chunk, owners, count) This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paul Moore --- kernel/audit_tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..1b7a2f041793 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void) static struct audit_chunk *alloc_chunk(int count) { struct audit_chunk *chunk; - size_t size; int i; - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); + chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL); if (!chunk) return NULL; -- cgit v1.2.3 From 026bb845b0fff6dec91fe24511dad7d3067dc3ed Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Fri, 29 May 2020 22:12:14 +0800 Subject: ftrace: Fix maybe-uninitialized compiler warning During build compiler reports some 'false positive' warnings about variables {'seq_ops', 'filtered_pids', 'other_pids'} may be used uninitialized. This patch silences these warnings. 
Also delete some useless spaces Link: https://lkml.kernel.org/r/20200529141214.37648-1-pilgrimtao@gmail.com Signed-off-by: Kaitao Cheng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c163c3531faf..1903b80db6eb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2260,7 +2260,7 @@ ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, if (hash_contains_ip(ip, op->func_hash)) return op; - } + } return NULL; } @@ -3599,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v) if (direct) seq_printf(m, "\n\tdirect-->%pS", (void *)direct); } - } + } seq_putc(m, '\n'); @@ -7151,6 +7151,10 @@ static int pid_open(struct inode *inode, struct file *file, int type) case TRACE_NO_PIDS: seq_ops = &ftrace_no_pid_sops; break; + default: + trace_array_put(tr); + WARN_ON_ONCE(1); + return -EINVAL; } ret = seq_open(file, seq_ops); @@ -7229,6 +7233,10 @@ pid_write(struct file *filp, const char __user *ubuf, other_pids = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); break; + default: + ret = -EINVAL; + WARN_ON_ONCE(1); + goto out; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); -- cgit v1.2.3 From e04ec0de61c1eb9693179093e83ab8ca68a30d08 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Mon, 8 Jun 2020 17:26:52 -0400 Subject: padata: upgrade smp_mb__after_atomic to smp_mb in padata_do_serial A 5.7 kernel hangs during a tcrypt test of padata that waits for an AEAD request to finish. This is only seen on large machines running many concurrent requests. The issue is that padata never serializes the request. The removal of the reorder_objects atomic missed that the memory barrier in padata_do_serial() depends on it. Upgrade the barrier from smp_mb__after_atomic to smp_mb to get correct ordering again. Fixes: 3facced7aeed1 ("padata: remove reorder_objects") Signed-off-by: Daniel Jordan Cc: Steffen Klassert Cc: linux-kernel@vger.kernel.org Cc: Signed-off-by: Herbert Xu --- kernel/padata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 29fc5d87a4cd..4373f7adaa40 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -335,7 +335,7 @@ static void padata_reorder(struct parallel_data *pd) * * Ensure reorder queue is read after pd->lock is dropped so we see * new objects from another task in padata_do_serial. Pairs with - * smp_mb__after_atomic in padata_do_serial. + * smp_mb in padata_do_serial. */ smp_mb(); @@ -418,7 +418,7 @@ void padata_do_serial(struct padata_priv *padata) * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ - smp_mb__after_atomic(); + smp_mb(); padata_reorder(pd); } -- cgit v1.2.3 From 6c6935419e2fd8ef1fcde71c4e50b9095e520900 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 18 Jun 2020 16:46:31 -0700 Subject: bpf: Avoid verifier failure for 32bit pointer arithmetic When do experiments with llvm (disabling instcombine and simplifyCFG), I hit the following error with test_seg6_loop.o. 
; R1=pkt(id=0,off=0,r=48,imm=0), R7=pkt(id=0,off=40,r=48,imm=0) w2 = w7 ; R2_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) w2 -= w1 R2 32-bit pointer arithmetic prohibited The corresponding source code is: uint32_t srh_off // srh and skb->data are all packet pointers srh_off = (char *)srh - (char *)(long)skb->data; The verifier does not support 32-bit pointer/scalar arithmetic. Without my llvm change, the code looks like ; R3=pkt(id=0,off=40,r=48,imm=0), R8=pkt(id=0,off=0,r=48,imm=0) w3 -= w8 ; R3_w=inv(id=0) This is explicitly allowed in verifier if both registers are pointers and the opcode is BPF_SUB. To fix this problem, I changed the verifier to allow 32-bit pointer/scaler BPF_SUB operations. At the source level, the issue could be workarounded with inline asm or changing "uint32_t srh_off" to "uint64_t srh_off". But I feel that verifier change might be the right thing to do. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200618234631.3321118-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..a1857c4ffaaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5031,6 +5031,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { + __mark_reg_unknown(env, dst_reg); + return 0; + } + verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", dst); -- cgit v1.2.3 From 625d3449788f85569096780592549d0340e9c0c7 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 21 Jun 2020 21:02:22 -0600 Subject: Revert "kernel/printk: add kmsg SEEK_CUR handling" This reverts commit 8ece3b3eb576a78d2e67ad4c3a80a39fa6708809. This commit broke userspace. Bash uses ESPIPE to determine whether or not the file should be read using "unbuffered I/O", which means reading 1 byte at a time instead of 128 bytes at a time. I used to use bash to read through kmsg in a really quite nasty way: while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do echo "SARU $line" done < /dev/kmsg This will show all lines that can fit into the 128 byte buffer, and skip lines that don't. That's pretty awful, but at least it worked. With this change, bash now tries to do 1-byte reads, which means it skips all the lines, which is worse than before. Now, I don't really care very much about this, and I'm already look for a workaround. But I did just spend an hour trying to figure out why my scripts were broken. Either way, it makes no difference to me personally whether this is reverted, but it might be something to consider. If you declare that "trying to read /dev/kmsg with bash is terminally stupid anyway," I might be inclined to agree with you. But do note that bash uses lseek(fd, 0, SEEK_CUR)==>ESPIPE to determine whether or not it's reading from a pipe. Cc: Bruno Meneguele Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: David Laight Cc: Sergey Senozhatsky Cc: Petr Mladek Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c14835be46c..b71eaf5f5a86 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -974,16 +974,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) user->idx = log_next_idx; user->seq = log_next_seq; break; - case SEEK_CUR: - /* - * It isn't supported due to the record nature of this - * interface: _SET _DATA and _END point to very specific - * record positions, while _CUR would be more useful in case - * of a byte-based log. Because of that, return the default - * errno value for invalid seek operation. - */ - ret = -ESPIPE; - break; default: ret = -EINVAL; } -- cgit v1.2.3 From 3af8588c77186bf08e55e7281da83d88373481d7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 8 Jun 2020 17:28:50 +0200 Subject: fork: fold legacy_clone_args_valid() into _do_fork() This separate helper only existed to guarantee the mutual exclusivity of CLONE_PIDFD and CLONE_PARENT_SETTID for legacy clone since CLONE_PIDFD abuses the parent_tid field to return the pidfd. But we can actually handle this uniformely thus removing the helper. For legacy clone we can detect that CLONE_PIDFD is specified in conjunction with CLONE_PARENT_SETTID because they will share the same memory which is invalid and for clone3() setting the separate pidfd and parent_tid fields to the same memory is bogus as well. So fold that helper directly into _do_fork() by detecting this case. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Al Viro Cc: Geert Uytterhoeven Cc: "Matthew Wilcox (Oracle)" Cc: "Peter Zijlstra (Intel)" Cc: linux-m68k@lists.linux-m68k.org Cc: x86@kernel.org Signed-off-by: Christian Brauner --- kernel/fork.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..9875aeb2ba41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2422,6 +2422,20 @@ long _do_fork(struct kernel_clone_args *args) int trace = 0; long nr; + /* + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate + * field in struct clone_args and it still doesn't make sense to have + * them both point at the same memory location. Performing this check + * here has the advantage that we don't need to have a separate helper + * to check for legacy clone(). + */ + if ((args->flags & CLONE_PIDFD) && + (args->flags & CLONE_PARENT_SETTID) && + (args->pidfd == args->parent_tid)) + return -EINVAL; + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2479,16 +2493,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) -{ - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((kargs->flags & CLONE_PIDFD) && - (kargs->flags & CLONE_PARENT_SETTID)) - return false; - - return true; -} - #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. 
*/ @@ -2508,9 +2512,6 @@ long do_fork(unsigned long clone_flags, .stack_size = stack_size, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif @@ -2593,9 +2594,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif -- cgit v1.2.3 From a2d0d62f4d9e3546134ba08e102ca7decd4ed836 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:41 -0700 Subject: bpf: Switch btf_parse_vmlinux to btf_find_by_name_kind btf_parse_vmlinux() implements manual search for struct bpf_ctx_convert since at the time of implementing btf_find_by_name_kind() was not available. Later btf_find_by_name_kind() was introduced in 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS"). It provides similar search functionality and can be leveraged in btf_parse_vmlinux(). Do it. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/6e12d5c3e8a3d552925913ef73a695dd1bb27800.1592600985.git.rdna@fb.com --- kernel/bpf/btf.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..3eb804618a53 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3591,7 +3591,7 @@ struct btf *btf_parse_vmlinux(void) struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, i; + int err, btf_id; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3625,24 +3625,13 @@ struct btf *btf_parse_vmlinux(void) goto errout; /* find struct bpf_ctx_convert for type checking later */ - for (i = 1; i <= btf->nr_types; i++) { - const struct btf_type *t; - const char *tname; - - t = btf_type_by_id(btf, i); - if (!__btf_type_is_struct(t)) - continue; - tname = __btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, "bpf_ctx_convert")) { - /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = t; - break; - } - } - if (i > btf->nr_types) { - err = -ENOENT; + btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT); + if (btf_id < 0) { + err = btf_id; goto errout; } + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); bpf_struct_ops_init(btf, log); -- cgit v1.2.3 From 41c48f3a98231738c5ce79f6f2aa6e40ba924d18 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:43 -0700 Subject: bpf: Support access to bpf map fields There are multiple use-cases when it's convenient to have access to bpf map fields, both `struct bpf_map` and map type specific struct-s such as `struct bpf_array`, `struct bpf_htab`, etc. For example while working with sock arrays it can be necessary to calculate the key based on map->max_entries (some_hash % max_entries). Currently this is solved by communicating max_entries via "out-of-band" channel, e.g. via additional map with known key to get info about target map. That works, but is not very convenient and error-prone while working with many maps. In other cases necessary data is dynamic (i.e. unknown at loading time) and it's impossible to get it at all. For example while working with a hash table it can be convenient to know how much capacity is already used (bpf_htab.count.counter for BPF_F_NO_PREALLOC case). 
At the same time kernel knows this info and can provide it to bpf program. Fill this gap by adding support to access bpf map fields from bpf program for both `struct bpf_map` and map type specific fields. Support is implemented via btf_struct_access() so that a user can define their own `struct bpf_map` or map type specific struct in their program with only necessary fields and preserve_access_index attribute, cast a map to this struct and use a field. For example: struct bpf_map { __u32 max_entries; } __attribute__((preserve_access_index)); struct bpf_array { struct bpf_map map; __u32 elem_size; } __attribute__((preserve_access_index)); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); } m_array SEC(".maps"); SEC("cgroup_skb/egress") int cg_skb(void *ctx) { struct bpf_array *array = (struct bpf_array *)&m_array; struct bpf_map *map = (struct bpf_map *)&m_array; /* .. use map->max_entries or array->map.max_entries .. */ } Similarly to other btf_struct_access() use-cases (e.g. struct tcp_sock in net/ipv4/bpf_tcp_ca.c) the patch allows access to any fields of corresponding struct. Only reading from map fields is supported. For btf_struct_access() to work there should be a way to know btf id of a struct that corresponds to a map type. To get btf id there should be a way to get a stringified name of map-specific struct, such as "bpf_array", "bpf_htab", etc for a map type. Two new fields are added to `struct bpf_map_ops` to handle it: * .map_btf_name keeps a btf name of a struct returned by map_alloc(); * .map_btf_id is used to cache btf id of that struct. To make btf ids calculation cheaper they're calculated once while preparing btf_vmlinux and cached same way as it's done for btf_id field of `struct bpf_func_proto` While calculating btf ids, struct names are NOT checked for collision. Collisions will be checked as a part of the work to prepare btf ids used in verifier in compile time that should land soon. The only known collision for `struct bpf_htab` (kernel/bpf/hashtab.c vs net/core/sock_map.c) was fixed earlier. Both new fields .map_btf_name and .map_btf_id must be set for a map type for the feature to work. If neither is set for a map type, verifier will return ENOTSUPP on a try to access map_ptr of corresponding type. If just one of them set, it's verifier misconfiguration. Only `struct bpf_array` for BPF_MAP_TYPE_ARRAY and `struct bpf_htab` for BPF_MAP_TYPE_HASH are supported by this patch. Other map types will be supported separately. The feature is available only for CONFIG_DEBUG_INFO_BTF=y and gated by perfmon_capable() so that unpriv programs won't have access to bpf map fields. 
Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/6479686a0cd1e9067993df57b4c3eef0e276fec9.1592600985.git.rdna@fb.com --- kernel/bpf/arraymap.c | 3 ++ kernel/bpf/btf.c | 40 +++++++++++++++++++++++++ kernel/bpf/hashtab.c | 3 ++ kernel/bpf/verifier.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 120 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 11584618e861..e7caa48812fb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -494,6 +494,7 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -510,6 +511,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_btf_name = "bpf_array", + .map_btf_id = &array_map_btf_id, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3eb804618a53..e377d1981730 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3571,6 +3571,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_LINK_TYPE(_id, _name) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include +#undef BPF_PROG_TYPE +#undef BPF_LINK_TYPE +#undef BPF_MAP_TYPE +}; + +static int btf_vmlinux_map_ids_init(const struct btf *btf, + struct bpf_verifier_log *log) +{ + const struct bpf_map_ops *ops; + int i, btf_id; + + for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { + ops = btf_vmlinux_map_ops[i]; + if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) + continue; + if (!ops->map_btf_name || !ops->map_btf_id) { + bpf_log(log, "map type %d is misconfigured\n", i); + return -EINVAL; + } + btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, + BTF_KIND_STRUCT); + if (btf_id < 0) + return btf_id; + *ops->map_btf_id = btf_id; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -3633,6 +3668,11 @@ struct btf *btf_parse_vmlinux(void) /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); + /* find bpf map structs for map_ptr access checking */ + err = btf_vmlinux_map_ids_init(btf, log); + if (err < 0) + goto errout; + bpf_struct_ops_init(btf, log); btf_verifier_env_free(env); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4b288a3c3c9..2c5999e02060 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1614,6 +1614,7 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1625,6 +1626,8 @@ const struct bpf_map_ops htab_map_ops = { .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_map_btf_id, }; const struct bpf_map_ops htab_lru_map_ops = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 
a1857c4ffaaf..7460f967cb75 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1351,6 +1351,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } +static void mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, u32 btf_id) +{ + if (reg_type == SCALAR_VALUE) { + mark_reg_unknown(env, regs, regno); + return; + } + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf_id = btf_id; +} + #define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) @@ -3182,19 +3195,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - if (atype == BPF_READ && value_regno >= 0) { - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; - } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + if (atype == BPF_READ && value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + + return 0; +} + +static int check_ptr_to_map_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + struct bpf_map *map = reg->map_ptr; + const struct btf_type *t; + const char *tname; + u32 btf_id; + int ret; + + if (!btf_vmlinux) { + verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!map->ops->map_btf_id || !*map->ops->map_btf_id) { + verbose(env, "map_ptr access not supported for map type %d\n", + map->map_type); + return -ENOTSUPP; + } + + t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + + if (!env->allow_ptr_to_map_access) { + verbose(env, + "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; } + if (off < 0) { + verbose(env, "R%d is %s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (atype != BPF_READ) { + verbose(env, "only read from %s is supported\n", tname); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + return 0; } + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -3363,6 +3425,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_BTF_ID) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == CONST_PTR_TO_MAP) { + err = check_ptr_to_map_access(env, regs, regno, off, size, t, + value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -10951,6 +11016,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); -- cgit v1.2.3 From 
2872e9ac33a4440173418147351ed4f93177e763 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:44 -0700 Subject: bpf: Set map_btf_{name, id} for all map types Set map_btf_name and map_btf_id for all map types so that map fields can be accessed by bpf programs. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/a825f808f22af52b018dbe82f1c7d29dab5fc978.1592600985.git.rdna@fb.com --- kernel/bpf/arraymap.c | 15 +++++++++++++++ kernel/bpf/bpf_struct_ops.c | 3 +++ kernel/bpf/cpumap.c | 3 +++ kernel/bpf/devmap.c | 6 ++++++ kernel/bpf/hashtab.c | 12 ++++++++++++ kernel/bpf/local_storage.c | 3 +++ kernel/bpf/lpm_trie.c | 3 +++ kernel/bpf/queue_stack_maps.c | 6 ++++++ kernel/bpf/reuseport_array.c | 3 +++ kernel/bpf/ringbuf.c | 3 +++ kernel/bpf/stackmap.c | 3 +++ 11 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e7caa48812fb..ec5cd11032aa 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -515,6 +515,7 @@ const struct bpf_map_ops array_map_ops = { .map_btf_id = &array_map_btf_id, }; +static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -525,6 +526,8 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &percpu_array_map_btf_id, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -871,6 +874,7 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -886,6 +890,8 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, + .map_btf_name = "bpf_array", + .map_btf_id = &prog_array_map_btf_id, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -964,6 +970,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -975,6 +982,8 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &perf_event_array_map_btf_id, }; #ifdef CONFIG_CGROUPS @@ -997,6 +1006,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } +static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -1007,6 +1017,8 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &cgroup_array_map_btf_id, }; #endif @@ -1080,6 +1092,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static int 
array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1092,4 +1105,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &array_of_maps_map_btf_id, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c6b0decaa46a..969c5d47f81f 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } +static int bpf_struct_ops_map_btf_id; const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, + .map_btf_name = "bpf_struct_ops_map", + .map_btf_id = &bpf_struct_ops_map_btf_id, }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 27595fc6da56..bd8658055c16 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,6 +543,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, @@ -551,6 +552,8 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_cpu_map", + .map_btf_id = &cpu_map_btf_id, }; static int bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0cbb72cdaf63..58acc46861ef 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -747,6 +747,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -755,8 +756,11 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_btf_id, }; +static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -765,6 +769,8 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_hash_map_btf_id, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2c5999e02060..acd06081d81d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1630,6 +1630,7 @@ const struct bpf_map_ops htab_map_ops = { .map_btf_id = &htab_map_btf_id, }; +static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1642,6 +1643,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_gen_lookup = 
htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab_lru), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_map_btf_id, }; /* Called from eBPF program */ @@ -1746,6 +1749,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1756,8 +1760,11 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_percpu_map_btf_id, }; +static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1768,6 +1775,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_lru_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_percpu_map_btf_id, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) @@ -1890,6 +1899,7 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } +static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -1902,4 +1912,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_of_maps_map_btf_id, }; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 33d01866bcc2..51bd5a8cb01b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -409,6 +409,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, rcu_read_unlock(); } +static int cgroup_storage_map_btf_id; const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -418,6 +419,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, + .map_btf_name = "bpf_cgroup_storage_map", + .map_btf_id = &cgroup_storage_map_btf_id, }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index c8cc4e4cf98d..1abd4e3f906d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -735,6 +735,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } +static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -743,4 +744,6 @@ const struct bpf_map_ops trie_map_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, .map_check_btf = trie_check_btf, + .map_btf_name = "lpm_trie", + .map_btf_id = &trie_map_btf_id, }; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 05c8e043b9d2..80c66a6d7c54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -262,6 +262,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +static int queue_map_btf_id; const struct bpf_map_ops 
queue_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -273,8 +274,11 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &queue_map_btf_id, }; +static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -286,4 +290,6 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &stack_map_btf_id, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..a09922f656e4 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -345,6 +345,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } +static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, @@ -352,4 +353,6 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, + .map_btf_name = "reuseport_array", + .map_btf_id = &reuseport_array_map_btf_id, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..dbf37aff4827 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -294,6 +294,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, @@ -303,6 +304,8 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_name = "bpf_ringbuf_map", + .map_btf_id = &ringbuf_map_btf_id, }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 599488f25e40..27dc9b1b08a5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -613,6 +613,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } +static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, @@ -621,6 +622,8 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stack_map", + .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) -- cgit v1.2.3 From 1a2b3357e860d890f8045367b179c7e7e802cd71 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 11 Jun 2020 12:20:32 -0700 Subject: dma-direct: add missing set_memory_decrypted() for coherent mapping When a coherent mapping is created in dma_direct_alloc_pages(), it needs to be decrypted if the device requires unencrypted DMA before returning. 
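A hedged sketch of the pattern the fix restores for the remapped coherent path; this is an illustration under the assumption that force_dma_unencrypted() and set_memory_decrypted() are combined as in the diff below, not the exact kernel code:

#include <linux/device.h>
#include <linux/dma-direct.h>
#include <linux/mm.h>
#include <linux/set_memory.h>

/*
 * Illustrative helper only (not the kernel's function): once a coherent
 * buffer has been allocated and remapped, a device that requires
 * unencrypted DMA needs the kernel mapping switched to decrypted before
 * the buffer is handed out (and switched back before it is freed).
 */
static int sketch_decrypt_coherent(struct device *dev, void *vaddr,
				   size_t size)
{
	if (!force_dma_unencrypted(dev))
		return 0;

	/* page-granular attribute change on the kernel mapping */
	return set_memory_decrypted((unsigned long)vaddr,
				    1 << get_order(size));
}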
Fixes: 3acac065508f ("dma-mapping: merge the generic remapping helpers into dma-direct") Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 2f69bfdbe315..93f578a8e613 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -195,6 +195,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_free_pages; + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); goto done; } -- cgit v1.2.3 From d07ae4c486908615ab336b987c7c367d132fd844 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Jun 2020 17:23:31 +0200 Subject: dma-mapping: DMA_COHERENT_POOL should select GENERIC_ALLOCATOR The dma coherent pool code needs genalloc. Move the select over from DMA_REMAP, which doesn't actually need it. Fixes: dbed452a078d ("dma-pool: decouple DMA_REMAP from DMA_COHERENT_POOL") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Acked-by: David Rientjes --- kernel/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 14ef8e1bdefe..1da3f44f2565 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -75,12 +75,12 @@ config DMA_NONCOHERENT_MMAP bool config DMA_COHERENT_POOL + select GENERIC_ALLOCATOR bool config DMA_REMAP bool depends on MMU - select GENERIC_ALLOCATOR select DMA_NONCOHERENT_MMAP config DMA_DIRECT_REMAP -- cgit v1.2.3 From 8e36baf97b252cdcafa53589e8227cbb1e85f0b0 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 23 Jun 2020 14:07:55 +0200 Subject: dma-remap: align the size in dma_common_*_remap() Running a guest with a virtio-iommu protecting virtio devices is broken since commit 515e5b6d90d4 ("dma-mapping: use vmap insted of reimplementing it"). Before the conversion, the size was page aligned in __get_vm_area_node(). Doing so fixes the regression. Fixes: 515e5b6d90d4 ("dma-mapping: use vmap insted of reimplementing it") Signed-off-by: Eric Auger Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index e739a6eea6e7..78b23f089cf1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -24,7 +24,8 @@ void *dma_common_pages_remap(struct page **pages, size_t size, { void *vaddr; - vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT, + VM_DMA_COHERENT, prot); if (vaddr) find_vm_area(vaddr)->pages = pages; return vaddr; @@ -37,7 +38,7 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int count = size >> PAGE_SHIFT; + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page **pages; void *vaddr; int i; -- cgit v1.2.3 From 097350d1c6e1f5808cae142006f18a0bbc57018d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jun 2020 15:18:15 -0400 Subject: ring-buffer: Zero out time extend if it is nested and not absolute Currently the ring buffer makes events that happen in interrupts that preempt another event have a delta of zero. (Hopefully we can change this soon). 
But this is to deal with the races of updating a global counter with lockless and nesting functions updating deltas. With the addition of absolute time stamps, the time extend didn't follow this rule. A time extend can happen if two events happen longer than 2^27 nanoseconds appart, as the delta time field in each event is only 27 bits. If that happens, then a time extend is injected with 2^59 bits of nanoseconds to use (18 years). But if the 2^27 nanoseconds happen between two events, and as it is writing the event, an interrupt triggers, it will see the 2^27 difference as well and inject a time extend of its own. But a recent change made the time extend logic not take into account the nesting, and this can cause two time extend deltas to happen moving the time stamp much further ahead than the current time. This gets all reset when the ring buffer moves to the next page, but that can cause time to appear to go backwards. This was observed in a trace-cmd recording, and since the data is saved in a file, with trace-cmd report --debug, it was possible to see that this indeed did happen! bash-52501 110d... 81778.908247: sched_switch: bash:52501 [120] S ==> swapper/110:0 [120] [12770284:0x2e8:64] -0 110d... 81778.908757: sched_switch: swapper/110:0 [120] R ==> bash:52501 [120] [509947:0x32c:64] TIME EXTEND: delta:306454770 length:0 bash-52501 110.... 81779.215212: sched_swap_numa: src_pid=52501 src_tgid=52388 src_ngid=52501 src_cpu=110 src_nid=2 dst_pid=52509 dst_tgid=52388 dst_ngid=52501 dst_cpu=49 dst_nid=1 [0:0x378:48] TIME EXTEND: delta:306458165 length:0 bash-52501 110dNh. 81779.521670: sched_wakeup: migration/110:565 [0] success=1 CPU:110 [0:0x3b4:40] and at the next page, caused the time to go backwards: bash-52504 110d... 81779.685411: sched_switch: bash:52504 [120] S ==> swapper/110:0 [120] [8347057:0xfb4:64] CPU:110 [SUBBUFFER START] [81779379165886:0x1320000] -0 110dN.. 81779.379166: sched_wakeup: bash:52504 [120] success=1 CPU:110 [0:0x10:40] -0 110d... 81779.379167: sched_switch: swapper/110:0 [120] R ==> bash:52504 [120] [1168:0x3c:64] Link: https://lkml.kernel.org/r/20200622151815.345d1bf5@oasis.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Cc: stable@vger.kernel.org Fixes: dc4e2801d400b ("ring-buffer: Redefine the unimplemented RINGBUF_TYPE_TIME_STAMP") Reported-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e1ca48be50..00867ff82412 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2427,7 +2427,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) { bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); - event = rb_add_time_stamp(event, info->delta, abs); + event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } -- cgit v1.2.3 From 8e6cf365e1d5c70e275a77a3c5ad7e3dc685474c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 4 Jun 2020 09:20:49 -0400 Subject: audit: log nftables configuration change events iptables, ip6tables, arptables and ebtables table registration, replacement and unregistration configuration events are logged for the native (legacy) iptables setsockopt api, but not for the nftables netlink api which is used by the nft-variant of iptables in addition to nftables itself. 
Add calls to log the configuration actions in the nftables netlink api. This uses the same NETFILTER_CFG record format but overloads the table field. type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=?:0;?:0 family=unspecified entries=2 op=nft_register_gen pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=firewalld:1;?:0 family=inet entries=0 op=nft_register_table pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=8 op=nft_register_chain pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=101 op=nft_register_rule pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=87 op=nft_register_setelem pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=0 op=nft_register_set pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld For further information please see issue https://github.com/linux-audit/audit-kernel/issues/124 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a23390457..3a9100e95fda 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "audit.h" @@ -136,9 +137,26 @@ struct audit_nfcfgop_tab { }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, - { AUDIT_XT_OP_UNREGISTER, "unregister" }, + { AUDIT_XT_OP_REGISTER, "xt_register" }, + { AUDIT_XT_OP_REPLACE, "xt_replace" }, + { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, + { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, + { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, + { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, + { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, + { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, + { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, + { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, + { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, + { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, + { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, + { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, + { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, + { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, + { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, + { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, + { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) -- cgit v1.2.3 From c4c0bdc0d2d084ed847c7066bdf59fe2cd25aa17 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 17:10:54 -0700 Subject: bpf: Set the number of exception entries properly for subprograms Currently, if a bpf program has more than one subprograms, each program will be jitted separately. 
For programs with bpf-to-bpf calls the prog->aux->num_exentries is not setup properly. For example, with bpf_iter_netlink.c modified to force one function to be not inlined and with CONFIG_BPF_JIT_ALWAYS_ON the following error is seen: $ ./test_progs -n 3/3 ... libbpf: failed to load program 'iter/netlink' libbpf: failed to load object 'bpf_iter_netlink' libbpf: failed to load BPF skeleton 'bpf_iter_netlink': -4007 test_netlink:FAIL:bpf_iter_netlink__open_and_load skeleton open_and_load failed #3/3 netlink:FAIL The dmesg shows the following errors: ex gen bug which is triggered by the following code in arch/x86/net/bpf_jit_comp.c: if (excnt >= bpf_prog->aux->num_exentries) { pr_err("ex gen bug\n"); return -EFAULT; } This patch fixes the issue by computing proper num_exentries for each subprogram before calling JIT. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..8911d0576399 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9801,7 +9801,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err; + int err, num_exentries; if (env->subprog_cnt <= 1) return 0; @@ -9876,6 +9876,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + num_exentries = 0; + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + num_exentries++; + } + func[i]->aux->num_exentries = num_exentries; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; -- cgit v1.2.3 From b338cb921e6739ff59ce32f43342779fe5ffa732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sat, 20 Jun 2020 14:26:16 -0700 Subject: bpf: Restore behaviour of CAP_SYS_ADMIN allowing the loading of networking bpf programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for a regression in commit 2c78ee898d8f ("bpf: Implement CAP_BPF"). Before the above commit it was possible to load network bpf programs with just the CAP_SYS_ADMIN privilege. The Android bpfloader happens to run in such a configuration (it has SYS_ADMIN but not NET_ADMIN) and creates maps and loads bpf programs for later use by Android's netd (which has NET_ADMIN but not SYS_ADMIN). 
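For illustration, the permission rule being restored can be summarized by the sketch below; it is a standalone illustration of the intended policy, not the kernel's actual helper:

#include <linux/capability.h>
#include <linux/types.h>

/* Sketch of the restored rule for networking program types: CAP_NET_ADMIN,
 * or CAP_SYS_ADMIN as accepted before the regression. */
static bool sketch_may_load_net_prog_type(void)
{
	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}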
Fixes: 2c78ee898d8f ("bpf: Implement CAP_BPF") Reported-by: John Stultz Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Tested-by: John Stultz Link: https://lore.kernel.org/bpf/20200620212616.93894-1-zenczykowski@gmail.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..7d946435587d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,7 +2121,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; -- cgit v1.2.3 From 6c95503c292610ff2898b4271c510c16efdcd4e1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 20 Jun 2020 12:45:54 +0900 Subject: tracing/boot: Fix config dependency for synthedic event Since commit 726721a51838 ("tracing: Move synthetic events to a separate file") decoupled synthetic event from histogram, boot-time tracing also has to check CONFIG_SYNTH_EVENT instead of CONFIG_HIST_TRIGGERS. Link: http://lkml.kernel.org/r/159262475441.185015.5300725180746017555.stgit@devnote2 Fixes: 726721a51838 ("tracing: Move synthetic events to a separate file") Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 9de29bb45a27..8b5490cb02bb 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -120,7 +120,7 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) } #endif -#ifdef CONFIG_HIST_TRIGGERS +#ifdef CONFIG_SYNTH_EVENTS static int __init trace_boot_add_synth_event(struct xbc_node *node, const char *event) { -- cgit v1.2.3 From 6784beada631800f2c5afd567e5628c843362cee Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 20 Jun 2020 12:46:03 +0900 Subject: tracing: Fix event trigger to accept redundant spaces Fix the event trigger to accept redundant spaces in the trigger input. For example, these return -EINVAL echo " traceon" > events/ftrace/print/trigger echo "traceon if common_pid == 0" > events/ftrace/print/trigger echo "disable_event:kmem:kmalloc " > events/ftrace/print/trigger But these are hard to find what is wrong. To fix this issue, use skip_spaces() to remove spaces in front of actual tokens, and set NULL if there is no token. 
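A hedged sketch of the whitespace-tolerant tokenizing idiom this fix applies; the helper below is illustrative, not the actual trigger parser:

#include <linux/string.h>

/* Illustrative helper: take the next token from *param, tolerating
 * redundant spaces, and leave *param as NULL when nothing but
 * whitespace remains, so later checks see "no further argument". */
static char *sketch_next_token(char **param)
{
	char *tok;

	if (!*param)
		return NULL;

	*param = skip_spaces(*param);
	tok = strsep(param, " \t");
	if (*param) {
		*param = skip_spaces(*param);
		if (!**param)
			*param = NULL;
	}
	return tok;
}

With this idiom, inputs such as " traceon" or "traceon   if common_pid == 0" parse the same way as their single-space forms.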
Link: http://lkml.kernel.org/r/159262476352.185015.5261566783045364186.stgit@devnote2 Cc: Tom Zanussi Cc: stable@vger.kernel.org Fixes: 85f2b08268c0 ("tracing: Add basic event trigger framework") Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3a74736da363..f725802160c0 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -216,11 +216,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) int trigger_process_regex(struct trace_event_file *file, char *buff) { - char *command, *next = buff; + char *command, *next; struct event_command *p; int ret = -EINVAL; + next = buff = skip_spaces(buff); command = strsep(&next, ": \t"); + if (next) { + next = skip_spaces(next); + if (!*next) + next = NULL; + } command = (command[0] != '!') ? command : command + 1; mutex_lock(&trigger_cmd_mutex); @@ -630,8 +636,14 @@ event_trigger_callback(struct event_command *cmd_ops, int ret; /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) + if (param && isdigit(param[0])) { trigger = strsep(¶m, " \t"); + if (param) { + param = skip_spaces(param); + if (!*param) + param = NULL; + } + } trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); @@ -1368,6 +1380,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops, trigger = strsep(¶m, " \t"); if (!trigger) return -EINVAL; + if (param) { + param = skip_spaces(param); + if (!*param) + param = NULL; + } system = strsep(&trigger, ":"); if (!trigger) -- cgit v1.2.3 From 20dc3847cc2fc886ee4eb9112e6e2fad9419b0c7 Mon Sep 17 00:00:00 2001 From: Sascha Ortmann Date: Thu, 18 Jun 2020 18:33:01 +0200 Subject: tracing/boottime: Fix kprobe multiple events Fix boottime kprobe events to report and abort after each failure when adding probes. As an example, when we try to set multiprobe kprobe events in bootconfig like this: ftrace.event.kprobes.vfsevents { probes = "vfs_read $arg1 $arg2,, !error! not reported;?", // leads to error "vfs_write $arg1 $arg2" } This will not work as expected. After commit da0f1f4167e3af69e ("tracing/boottime: Fix kprobe event API usage"), the function trace_boot_add_kprobe_event will not produce any error message when adding a probe fails at kprobe_event_gen_cmd_start. Furthermore, we continue to add probes when kprobe_event_gen_cmd_end fails (and kprobe_event_gen_cmd_start did not fail). In this case the function even returns successfully when the last call to kprobe_event_gen_cmd_end is successful. The behaviour of reporting and aborting after failures is not consistent. The function trace_boot_add_kprobe_event now reports each failure and stops adding probes immediately. 
Link: https://lkml.kernel.org/r/20200618163301.25854-1-sascha.ortmann@stud.uni-hannover.de Cc: stable@vger.kernel.org Cc: linux-kernel@i4.cs.fau.de Co-developed-by: Maximilian Werner Fixes: da0f1f4167e3 ("tracing/boottime: Fix kprobe event API usage") Acked-by: Masami Hiramatsu Signed-off-by: Maximilian Werner Signed-off-by: Sascha Ortmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8b5490cb02bb..fa0fc08c6ef8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -101,12 +101,16 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); ret = kprobe_event_gen_cmd_start(&cmd, event, val); - if (ret) + if (ret) { + pr_err("Failed to generate probe: %s\n", buf); break; + } ret = kprobe_event_gen_cmd_end(&cmd); - if (ret) + if (ret) { pr_err("Failed to add probe: %s\n", buf); + break; + } } return ret; -- cgit v1.2.3 From 005e696df65d0ff90468ecf38a50aa584dc82421 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Wed, 24 Jun 2020 15:33:26 +0300 Subject: gcc-plugins/stackleak: Don't instrument itself There is no need to try instrumenting functions in kernel/stackleak.c. Otherwise that can cause issues if the cleanup pass of stackleak gcc plugin is disabled. Signed-off-by: Alexander Popov Link: https://lore.kernel.org/r/20200624123330.83226-2-alex.popov@linux.com Signed-off-by: Kees Cook --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..155b5380500a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o +CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n -- cgit v1.2.3 From feee1b8c490821f29aae416a5422795f5a29263d Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Wed, 24 Jun 2020 15:33:29 +0300 Subject: gcc-plugins/stackleak: Use asm instrumentation to avoid useless register saving The kernel code instrumentation in stackleak gcc plugin works in two stages. At first, stack tracking is added to GIMPLE representation of every function (except some special cases). And later, when stack frame size info is available, stack tracking is removed from the RTL representation of the functions with small stack frame. There is an unwanted side-effect for these functions: some of them do useless work with caller-saved registers. 
As an example of such case, proc_sys_write without() instrumentation: 55 push %rbp 41 b8 01 00 00 00 mov $0x1,%r8d 48 89 e5 mov %rsp,%rbp e8 11 ff ff ff callq ffffffff81284610 5d pop %rbp c3 retq 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 00 00 00 proc_sys_write() with instrumentation: 55 push %rbp 48 89 e5 mov %rsp,%rbp 41 56 push %r14 41 55 push %r13 41 54 push %r12 53 push %rbx 49 89 f4 mov %rsi,%r12 48 89 fb mov %rdi,%rbx 49 89 d5 mov %rdx,%r13 49 89 ce mov %rcx,%r14 4c 89 f1 mov %r14,%rcx 4c 89 ea mov %r13,%rdx 4c 89 e6 mov %r12,%rsi 48 89 df mov %rbx,%rdi 41 b8 01 00 00 00 mov $0x1,%r8d e8 f2 fe ff ff callq ffffffff81298e80 5b pop %rbx 41 5c pop %r12 41 5d pop %r13 41 5e pop %r14 5d pop %rbp c3 retq 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 00 00 Let's improve the instrumentation to avoid this: 1. Make stackleak_track_stack() save all register that it works with. Use no_caller_saved_registers attribute for that function. This attribute is available for x86_64 and i386 starting from gcc-7. 2. Insert calling stackleak_track_stack() in asm: asm volatile("call stackleak_track_stack" :: "r" (current_stack_pointer)) Here we use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h. The input constraint is taken into account during gcc shrink-wrapping optimization. It is needed to be sure that stackleak_track_stack() call is inserted after the prologue of the containing function, when the stack frame is prepared. This work is a deep reengineering of the idea described on grsecurity blog https://grsecurity.net/resolving_an_unfortunate_stackleak_interaction Signed-off-by: Alexander Popov Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20200624123330.83226-5-alex.popov@linux.com Signed-off-by: Kees Cook --- kernel/stackleak.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index b193a59fc05b..a8fc9ae1d03d 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers notrace stackleak_track_stack(void) { - /* - * N.B. stackleak_erase() fills the kernel stack with the poison value, - * which has the register width. That code assumes that the value - * of 'lowest_stack' is aligned on the register width boundary. - * - * That is true for x86 and x86_64 because of the kernel stack - * alignment on these platforms (for details, see 'cc_stack_align' in - * arch/x86/Makefile). Take care of that when you port STACKLEAK to - * new platforms. - */ - unsigned long sp = (unsigned long)&sp; + unsigned long sp = current_stack_pointer; /* * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than @@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void) */ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && sp >= (unsigned long)task_stack_page(current) + sizeof(unsigned long)) { -- cgit v1.2.3 From 521b512b157a1315ff2bf11c11ab184c79515aea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:47 +0100 Subject: PM / EM: change naming convention from 'capacity' to 'performance' The Energy Model uses concept of performance domain and capacity states in order to calculate power used by CPUs. 
Change naming convention from capacity to performance state would enable wider usage in future, e.g. upcoming support for other devices other than CPUs. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 44 ++++++++++++++++++++++---------------------- kernel/sched/topology.c | 20 ++++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0a9326f5f421..9892d548a0fa 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -27,18 +27,18 @@ static DEFINE_MUTEX(em_pd_mutex); #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) { struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); - /* Create per-cs directory */ + /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &cs->frequency); - debugfs_create_ulong("power", 0444, d, &cs->power); - debugfs_create_ulong("cost", 0444, d, &cs->cost); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -62,9 +62,9 @@ static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); - /* Create a sub-directory for each capacity state */ - for (i = 0; i < pd->nr_cap_states; i++) - em_debug_create_cs(&pd->table[i], d); + /* Create a sub-directory for each performance state */ + for (i = 0; i < pd->nr_perf_states; i++) + em_debug_create_ps(&pd->table[i], d); } static int __init em_debug_init(void) @@ -84,7 +84,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; int i, ret, cpu = cpumask_first(span); - struct em_cap_state *table; + struct em_perf_state *table; struct em_perf_domain *pd; u64 fmax; @@ -99,26 +99,26 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, if (!table) goto free_pd; - /* Build the list of capacity states for this performance domain */ + /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest capacity state of 'cpu' above 'freq' and updates + * lowest performance state of 'cpu' above 'freq' and updates * 'power' and 'freq' accordingly. */ ret = cb->active_power(&power, &freq, cpu); if (ret) { - pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); - goto free_cs_table; + pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); + goto free_ps_table; } /* * We expect the driver callback to increase the frequency for - * higher capacity states. + * higher performance states. 
*/ if (freq <= prev_freq) { pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); - goto free_cs_table; + goto free_ps_table; } /* @@ -127,7 +127,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ if (!power || power > EM_CPU_MAX_POWER) { pr_err("pd%d: invalid power: %lu\n", cpu, power); - goto free_cs_table; + goto free_ps_table; } table[i].power = power; @@ -141,12 +141,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", + pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", cpu, i, i - 1); prev_opp_eff = opp_eff; } - /* Compute the cost of each capacity_state. */ + /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { table[i].cost = div64_u64(fmax * table[i].power, @@ -154,14 +154,14 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, } pd->table = table; - pd->nr_cap_states = nr_states; + pd->nr_perf_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); em_debug_create_pd(pd, cpu); return pd; -free_cs_table: +free_ps_table: kfree(table); free_pd: kfree(pd); @@ -185,7 +185,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get); /** * em_register_perf_domain() - Register the Energy Model of a performance domain * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register + * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model * * Create Energy Model tables for a performance domain using the callbacks diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..2f91d3126365 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas) * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. 
*/ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; -- cgit v1.2.3 From 7d9895c7fbfc9c70afce7029b7de0f3f974adb88 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:48 +0100 Subject: PM / EM: introduce em_dev_register_perf_domain function Add now function in the Energy Model framework which is going to support new devices. This function will help in transition and make it smoother. For now it still checks if the cpumask is a valid pointer, which will be removed later when the new structures and infrastructure will be ready. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9892d548a0fa..875b163e54ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -125,7 +125,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, * The power returned by active_state() is expected to be * positive, in milli-watts and to fit into 16 bits. */ - if (!power || power > EM_CPU_MAX_POWER) { + if (!power || power > EM_MAX_POWER) { pr_err("pd%d: invalid power: %lu\n", cpu, power); goto free_ps_table; } @@ -183,10 +183,13 @@ struct em_perf_domain *em_cpu_get(int cpu) EXPORT_SYMBOL_GPL(em_cpu_get); /** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model + * @span : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. 
@@ -196,14 +199,14 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * * Return 0 on success */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *span) { unsigned long cap, prev_cap = 0; struct em_perf_domain *pd; int cpu, ret = 0; - if (!span || !nr_states || !cb) + if (!dev || !span || !nr_states || !cb) return -EINVAL; /* @@ -255,4 +258,29 @@ unlock: return ret; } +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_register_perf_domain() - Register the Energy Model of a performance domain + * @span : Mask of CPUs in the performance domain + * @nr_states : Number of capacity states to register + * @cb : Callback functions providing the data of the Energy Model + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb) +{ + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpumask_first(span)); + + return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); +} EXPORT_SYMBOL_GPL(em_register_perf_domain); -- cgit v1.2.3 From d0351cc3b0f57214d157e4d589564730af2aedae Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:49 +0100 Subject: PM / EM: update callback structure and add device pointer The Energy Model framework is going to support devices other that CPUs. In order to make this happen change the callback function and add pointer to a device as an argument. Update the related users to use new function and new callback from the Energy Model. Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 875b163e54ab..5b8a1566526a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -78,8 +78,9 @@ core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} #endif -static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, - struct em_data_callback *cb) +static struct em_perf_domain * +em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, + cpumask_t *span) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; @@ -106,7 +107,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, * lowest performance state of 'cpu' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, cpu); + ret = cb->active_power(&power, &freq, dev); if (ret) { pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); goto free_ps_table; @@ -237,7 +238,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } /* Create the performance domain and add it to the Energy Model. 
*/ - pd = em_create_pd(span, nr_states, cb); + pd = em_create_pd(dev, nr_states, cb, span); if (!pd) { ret = -EINVAL; goto unlock; -- cgit v1.2.3 From a67549c8e568627290234e9fbe833cb9dfd36b55 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:26 +0000 Subject: blktrace: annotate required lock on do_blk_trace_setup() Ensure it is clear which lock is required on do_blk_trace_setup(). Suggested-by: Bart Van Assche Signed-off-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5ef0484513ec..5a88a6b55933 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -483,6 +483,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; + lockdep_assert_held(&q->blk_trace_mutex); + if (!buts->buf_size || !buts->buf_nr) return -EINVAL; -- cgit v1.2.3 From bad8e64fb19d3a0de5e564d9a7271c31bd684369 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:28 +0000 Subject: blktrace: fix debugfs use after free On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory") merged on v4.12 Omar fixed the original blktrace code for request-based drivers (multiqueue). This however left in place a possible crash, if you happen to abuse blktrace while racing to remove / add a device. We used to use asynchronous removal of the request_queue, and with that the issue was easier to reproduce. Now that we have reverted to synchronous removal of the request_queue, the issue is still possible to reproduce, its however just a bit more difficult. We essentially run two instances of break-blktrace which add/remove a loop device, and setup a blktrace and just never tear the blktrace down. We do this twice in parallel. This is easily reproduced with the script run_0004.sh from break-blktrace [0]. We can end up with two types of panics each reflecting where we race, one a failed blktrace setup: [ 252.426751] debugfs: Directory 'loop0' with parent 'block' already present! 
[ 252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 252.436592] #PF: supervisor write access in kernel mode [ 252.439822] #PF: error_code(0x0002) - not-present page [ 252.442967] PGD 0 P4D 0 [ 252.444656] Oops: 0002 [#1] SMP NOPTI [ 252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 252.456343] RIP: 0010:down_write+0x15/0x40 [ 252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246 [ 252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000 [ 252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001 [ 252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000 [ 252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0 [ 252.473346] FS: 00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000 [ 252.475225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0 [ 252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 252.479866] Call Trace: [ 252.480322] simple_recursive_removal+0x4e/0x2e0 [ 252.481078] ? debugfs_remove+0x60/0x60 [ 252.481725] ? relay_destroy_buf+0x77/0xb0 [ 252.482662] debugfs_remove+0x40/0x60 [ 252.483518] blk_remove_buf_file_callback+0x5/0x10 [ 252.484328] relay_close_buf+0x2e/0x60 [ 252.484930] relay_open+0x1ce/0x2c0 [ 252.485520] do_blk_trace_setup+0x14f/0x2b0 [ 252.486187] __blk_trace_setup+0x54/0xb0 [ 252.486803] blk_trace_ioctl+0x90/0x140 [ 252.487423] ? do_sys_openat2+0x1ab/0x2d0 [ 252.488053] blkdev_ioctl+0x4d/0x260 [ 252.488636] block_ioctl+0x39/0x40 [ 252.489139] ksys_ioctl+0x87/0xc0 [ 252.489675] __x64_sys_ioctl+0x16/0x20 [ 252.490380] do_syscall_64+0x52/0x180 [ 252.491032] entry_SYSCALL_64_after_hwframe+0x44/0xa9 And the other on the device removal: [ 128.528940] debugfs: Directory 'loop0' with parent 'block' already present! 
[ 128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 128.619537] #PF: supervisor write access in kernel mode [ 128.622700] #PF: error_code(0x0002) - not-present page [ 128.625842] PGD 0 P4D 0 [ 128.627585] Oops: 0002 [#1] SMP NOPTI [ 128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 128.640471] RIP: 0010:down_write+0x15/0x40 [ 128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246 [ 128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000 [ 128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0 [ 128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000 [ 128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0 [ 128.660500] FS: 00007febfd995540(0000) GS:ffff8ae9afd00000(0000) knlGS:0000000000000000 [ 128.662204] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0 [ 128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 128.667282] Call Trace: [ 128.667801] simple_recursive_removal+0x4e/0x2e0 [ 128.668663] ? debugfs_remove+0x60/0x60 [ 128.669368] debugfs_remove+0x40/0x60 [ 128.669985] blk_trace_free+0xd/0x50 [ 128.670593] __blk_trace_remove+0x27/0x40 [ 128.671274] blk_trace_shutdown+0x30/0x40 [ 128.671935] blk_release_queue+0x95/0xf0 [ 128.672589] kobject_put+0xa5/0x1b0 [ 128.673188] disk_release+0xa2/0xc0 [ 128.673786] device_release+0x28/0x80 [ 128.674376] kobject_put+0xa5/0x1b0 [ 128.674915] loop_remove+0x39/0x50 [loop] [ 128.675511] loop_control_ioctl+0x113/0x130 [loop] [ 128.676199] ksys_ioctl+0x87/0xc0 [ 128.676708] __x64_sys_ioctl+0x16/0x20 [ 128.677274] do_syscall_64+0x52/0x180 [ 128.677823] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The common theme here is: debugfs: Directory 'loop0' with parent 'block' already present This crash happens because of how blktrace uses the debugfs directory where it places its files. Upon init we always create the same directory which would be needed by blktrace but we only do this for make_request drivers (multiqueue) block drivers. When you race a removal of these devices with a blktrace setup you end up in a situation where the make_request recursive debugfs removal will sweep away the blktrace files and then later blktrace will also try to remove individual dentries which are already NULL. The inverse is also possible and hence the two types of use after frees. We don't create the block debugfs directory on init for these types of block devices: * request-based block driver block devices * every possible partition * scsi-generic And so, this race should in theory only be possible with make_request drivers. We can fix the UAF by simply re-using the debugfs directory for make_request drivers (multiqueue) and only creating the ephemeral directory for the other type of block devices. 
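Condensed to its core (a sketch of the hunk that follows, not the literal patch), the fix makes the directory choice explicit: only whole-disk multiqueue devices reuse the directory the block layer already owns, everything else gets a throw-away directory that blktrace itself creates and later removes:

	/* Sketch: directory selection in do_blk_trace_setup() after this fix. */
	if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains)
		dir = q->debugfs_dir;	/* shared with the block layer */
	else
		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);

Because blktrace no longer creates (and therefore no longer removes) a directory that the block layer also removes recursively, the double dentry removal behind both traces above goes away.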
The new clarifications on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace *prior* to processing a blktrace ensures the debugfs directories are only created if no possible directory name clashes are possible. This goes tested with: o nvme partitions o ISCSI with tgt, and blktracing against scsi-generic with: o block o tape o cdrom o media changer o blktests This patch is part of the work which disputes the severity of CVE-2019-19770 which shows this issue is not a core debugfs issue, but a misuse of debugfs within blktace. Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory") Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Cc: Bart Van Assche Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Nicolai Stange Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: yu kuai Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5a88a6b55933..e27dee345d81 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,10 +524,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) +#ifdef CONFIG_BLK_DEBUG_FS + /* + * When tracing whole make_request drivers (multiqueue) block devices, + * reuse the existing debugfs directory created by the block layer on + * init. For request-based block devices, all partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else +#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); bt->dev = dev; @@ -565,8 +573,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; -- cgit v1.2.3 From b431ef837e3374da0db8ff6683170359aaa0859c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:29 +0000 Subject: blktrace: ensure our debugfs dir exists We make an assumption that a debugfs directory exists, but since this can fail ensure it exists before allowing blktrace setup to complete. Otherwise we end up stuffing blktrace files on the debugfs root directory. In the worst case scenario this *in theory* can create an eventual panic *iff* in the future a similarly named file is created prior on the debugfs root directory. This theoretical crash can happen due to a recursive removal followed by a specific dentry removal. This doesn't fix any known crash, however I have seen the files go into the main debugfs root directory in cases where the debugfs directory was not created due to other internal bugs with blktrace now fixed. blktrace is also completely useless without this directory, so this ensures to userspace we only setup blktrace if the kernel can stuff files where they are supposed to go into. debugfs directory creations typically aren't checked for, and we have maintainers doing sweep removals of these checks, but since we need this check to ensure proper userspace blktrace functionality we make sure to annotate the justification for the check. 
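The failure mode guarded against in the hunk below follows from debugfs parent semantics: a NULL parent means "create at the top of debugfs". A condensed sketch of the new check and why it deliberately breaks the usual "don't check debugfs return values" rule:

	/*
	 * 'dir' may come from q->debugfs_dir or from debugfs_create_dir(); if
	 * it were silently NULL or an error here, the dropped/msg/relay files
	 * created later in do_blk_trace_setup() would land in the debugfs root
	 * (NULL parent means "top of debugfs") instead of under the per-device
	 * directory, so refuse the setup instead.
	 */
	if (IS_ERR_OR_NULL(dir)) {
		ret = -ENOENT;
		goto err;
	}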
Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e27dee345d81..46c273f4dec5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -538,6 +538,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, #endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + /* + * As blktrace relies on debugfs for its interface the debugfs directory + * is required, contrary to the usual mantra of not checking for debugfs + * files or directories. + */ + if (IS_ERR_OR_NULL(dir)) { + pr_warn("debugfs_dir not present for %s so skipping\n", + buts->name); + ret = -ENOENT; + goto err; + } + bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); -- cgit v1.2.3 From 85e0cbbb8a79537dbc465e9deb449a08b2b092a6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:30 +0000 Subject: block: create the request_queue debugfs_dir on registration We were only creating the request_queue debugfs_dir only for make_request block drivers (multiqueue), but never for request-based block drivers. We did this as we were only creating non-blktrace additional debugfs files on that directory for make_request drivers. However, since blktrace *always* creates that directory anyway, we special-case the use of that directory on blktrace. Other than this being an eye-sore, this exposes request-based block drivers to the same debugfs fragile race that used to exist with make_request block drivers where if we start adding files onto that directory we can later run a race with a double removal of dentries on the directory if we don't deal with this carefully on blktrace. Instead, just simplify things by always creating the request_queue debugfs_dir on request_queue registration. Rename the mutex also to reflect the fact that this is used outside of the blktrace context. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 46c273f4dec5..c086c38f4954 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; @@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -483,14 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; - lockdep_assert_held(&q->blk_trace_mutex); + lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - if (!blk_debugfs_root) - return -ENOENT; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -505,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * we can be. 
*/ if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; @@ -524,18 +521,15 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; -#ifdef CONFIG_BLK_DEBUG_FS /* - * When tracing whole make_request drivers (multiqueue) block devices, - * reuse the existing debugfs directory created by the block layer on - * init. For request-based block devices, all partitions block devices, + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + if (bdev && bdev == bdev->bd_contains) dir = q->debugfs_dir; else -#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* @@ -617,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -665,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start) struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -705,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -736,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: @@ -763,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -774,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -1662,7 +1656,7 @@ static int blk_trace_remove_queue(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -1837,10 +1831,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; @@ -1858,7 +1852,7 @@ static ssize_t 
sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: @@ -1901,10 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; @@ -1921,7 +1915,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { @@ -1936,7 +1930,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: -- cgit v1.2.3 From 1bc138c622959979eb547be2d3bbc6442a5c80b0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 10 Jun 2020 11:12:23 +0100 Subject: PM / EM: add support for other devices than CPUs in Energy Model Add support for other devices than CPUs. The registration function does not require a valid cpumask pointer and is ready to handle new devices. Some of the internal structures has been reorganized in order to keep consistent view (like removing per_cpu pd pointers). Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 244 ++++++++++++++++++++++++++++++-------------- 1 file changed, 167 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5b8a1566526a..32d76e78f992 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Energy Model of CPUs + * Energy Model of devices * - * Copyright (c) 2018, Arm ltd. + * Copyright (c) 2018-2020, Arm ltd. * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. */ #define pr_fmt(fmt) "energy_model: " fmt @@ -15,15 +16,17 @@ #include #include -/* Mapping of each CPU to the performance domain to which it belongs. */ -static DEFINE_PER_CPU(struct em_perf_domain *, em_data); - /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. 
*/ static DEFINE_MUTEX(em_pd_mutex); +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; @@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +static void em_debug_create_pd(struct device *dev) { struct dentry *d; - char name[8]; int i; - snprintf(name, sizeof(name), "pd%d", cpu); - /* Create the directory of the performance domain */ - d = debugfs_create_dir(name, rootdir); + d = debugfs_create_dir(dev_name(dev), rootdir); - debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); /* Create a sub-directory for each performance state */ - for (i = 0; i < pd->nr_perf_states; i++) - em_debug_create_ps(&pd->table[i], d); + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); + +} + +static void em_debug_remove_pd(struct device *dev) +{ + struct dentry *debug_dir; + + debug_dir = debugfs_lookup(dev_name(dev), rootdir); + debugfs_remove_recursive(debug_dir); } static int __init em_debug_init(void) @@ -76,40 +87,34 @@ static int __init em_debug_init(void) } core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} #endif -static struct em_perf_domain * -em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, - cpumask_t *span) + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; - int i, ret, cpu = cpumask_first(span); struct em_perf_state *table; - struct em_perf_domain *pd; + int i, ret; u64 fmax; - if (!cb->active_power) - return NULL; - - pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); - if (!pd) - return NULL; - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); if (!table) - goto free_pd; + return -ENOMEM; /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest performance state of 'cpu' above 'freq' and updates + * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ ret = cb->active_power(&power, &freq, dev); if (ret) { - pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); goto free_ps_table; } @@ -118,7 +123,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, * higher performance states. */ if (freq <= prev_freq) { - pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); goto free_ps_table; } @@ -127,7 +133,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, * positive, in milli-watts and to fit into 16 bits. 
*/ if (!power || power > EM_MAX_POWER) { - pr_err("pd%d: invalid power: %lu\n", cpu, power); + dev_err(dev, "EM: invalid power: %lu\n", + power); goto free_ps_table; } @@ -142,8 +149,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", - cpu, i, i - 1); + dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", + i, i - 1); prev_opp_eff = opp_eff; } @@ -156,30 +163,82 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, pd->table = table; pd->nr_perf_states = nr_states; - cpumask_copy(to_cpumask(pd->cpus), span); - em_debug_create_pd(pd, cpu); - - return pd; + return 0; free_ps_table: kfree(table); -free_pd: - kfree(pd); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret; + + if (_is_cpu_device(dev)) { + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; - return NULL; + return dev->em_pd; } +EXPORT_SYMBOL_GPL(em_pd_get); /** * em_cpu_get() - Return the performance domain for a CPU * @cpu : CPU to find the performance domain for * - * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't * exist. */ struct em_perf_domain *em_cpu_get(int cpu) { - return READ_ONCE(per_cpu(em_data, cpu)); + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); } EXPORT_SYMBOL_GPL(em_cpu_get); @@ -188,7 +247,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @dev : Device for which the EM is to register * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model - * @span : Pointer to cpumask_t, which in case of a CPU device is + * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. 
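To make the new device-oriented API concrete, here is a hypothetical driver-side sketch (all foo_* names and the numbers are invented for illustration, not part of the patch) registering a three-state EM for a non-CPU device, where the cpumask argument is simply NULL:

	/* Hypothetical example data: three OPPs, power in mW (fits in 16 bits). */
	static const unsigned long foo_khz[] = { 100000, 200000, 300000 };
	static const unsigned long foo_mw[]  = {     50,    120,    250 };

	/* Ceil *freq to the lowest supported frequency above it, report its cost. */
	static int foo_active_power(unsigned long *power, unsigned long *freq,
				    struct device *dev)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(foo_khz); i++) {
			if (foo_khz[i] >= *freq) {
				*freq = foo_khz[i];
				*power = foo_mw[i];
				return 0;
			}
		}
		return -EINVAL;
	}

	static struct em_data_callback foo_em_cb = EM_DATA_CB(foo_active_power);

	static int foo_probe(struct platform_device *pdev)
	{
		return em_dev_register_perf_domain(&pdev->dev, ARRAY_SIZE(foo_khz),
						   &foo_em_cb, NULL);
	}

	static int foo_remove(struct platform_device *pdev)
	{
		em_dev_unregister_perf_domain(&pdev->dev);
		return 0;
	}

A CPU device (e.g. a cpufreq driver) would instead pass policy->cpus as the last argument, matching the kernel-doc above.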
* @@ -201,13 +260,12 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span) + struct em_data_callback *cb, cpumask_t *cpus) { unsigned long cap, prev_cap = 0; - struct em_perf_domain *pd; - int cpu, ret = 0; + int cpu, ret; - if (!dev || !span || !nr_states || !cb) + if (!dev || !nr_states || !cb) return -EINVAL; /* @@ -216,47 +274,50 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); - for_each_cpu(cpu, span) { - /* Make sure we don't register again an existing domain. */ - if (READ_ONCE(per_cpu(em_data, cpu))) { - ret = -EEXIST; - goto unlock; - } + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } - /* - * All CPUs of a domain must have the same micro-architecture - * since they all share the same table. - */ - cap = arch_scale_cpu_capacity(cpu); - if (prev_cap && prev_cap != cap) { - pr_err("CPUs of %*pbl must have the same capacity\n", - cpumask_pr_args(span)); + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); ret = -EINVAL; goto unlock; } - prev_cap = cap; + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } } - /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(dev, nr_states, cb, span); - if (!pd) { - ret = -EINVAL; + ret = em_create_pd(dev, nr_states, cb, cpus); + if (ret) goto unlock; - } - for_each_cpu(cpu, span) { - /* - * The per-cpu array can be read concurrently from em_cpu_get(). - * The barrier enforces the ordering needed to make sure readers - * can only access well formed em_perf_domain structs. - */ - smp_store_release(per_cpu_ptr(&em_data, cpu), pd); - } + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); - pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); unlock: mutex_unlock(&em_pd_mutex); - return ret; } EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); @@ -285,3 +346,32 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); } EXPORT_SYMBOL_GPL(em_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. 
+ */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -- cgit v1.2.3 From 07891f15d91317b2220a0b610a2d7e324a88105d Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:51 +0100 Subject: PM / EM: remove em_register_perf_domain Remove old function em_register_perf_domain which is no longer needed. There is em_dev_register_perf_domain that covers old use cases and new as well. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 32d76e78f992..c1ff7fa030ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -322,31 +322,6 @@ unlock: } EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); -/** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register - * @cb : Callback functions providing the data of the Energy Model - * - * Create Energy Model tables for a performance domain using the callbacks - * defined in cb. - * - * If multiple clients register the same performance domain, all but the first - * registration will be ignored. - * - * Return 0 on success - */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) -{ - struct device *cpu_dev; - - cpu_dev = get_cpu_device(cpumask_first(span)); - - return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); -} -EXPORT_SYMBOL_GPL(em_register_perf_domain); - /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device * @dev : Device for which the EM is registered -- cgit v1.2.3 From f0b5694791ce70dba16758c3b838d5ddc7731b02 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:52 +0100 Subject: PM / EM: change name of em_pd_energy to em_cpu_energy Energy Model framework now supports other devices than CPUs. Refactor some of the functions in order to prevent wrong usage. The old function em_pd_energy has to generic name. It must not be used without proper cpumask pointer, which is possible only for CPU devices. Thus, rename it and add proper description to warn of potential wrong usage for other devices. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..6da601c8d383 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6497,7 +6497,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util); } /* -- cgit v1.2.3 From c06b0229579806f5b5e14af64cf9c5dc771445b3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:07 -0700 Subject: bpf: Support 'X' in bpf_seq_printf() helper 'X' tells kernel to print hex with upper case letters. /proc/net/tcp{4,6} seq_file show() used this, and supports it in bpf_seq_printf() helper too. 
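A minimal sketch of how a BPF iterator program can use the newly accepted conversion (illustrative only; it assumes a bpftool-generated vmlinux.h and libbpf's bpf_helpers.h, and mirrors the shape of the existing task-iterator selftests rather than anything in this patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char _license[] SEC("license") = "GPL";

	/* Illustrative program name; prints task flags in upper-case hex. */
	SEC("iter/task")
	int dump_task(struct bpf_iter__task *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct task_struct *task = ctx->task;
		char fmt[] = "pid %d flags %X\n";
		__u64 args[2];

		if (!task)
			return 0;

		args[0] = task->pid;
		args[1] = task->flags;
		bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
		return 0;
	}

Before this change, the format check in bpf_seq_printf() would have rejected the '%X' specifier with -EINVAL.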
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230807.3988014-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a3ac7de98baa..b44505f56b6d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -681,7 +681,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') { + fmt[i] != 'u' && fmt[i] != 'x' && + fmt[i] != 'X') { err = -EINVAL; goto out; } -- cgit v1.2.3 From 72e2b2b66f9c1225e51fc4a1c1e8512959195b76 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:08 -0700 Subject: bpf: Allow tracing programs to use bpf_jiffies64() helper /proc/net/tcp{4,6} uses jiffies for various computations. Let us add bpf_jiffies64() helper to tracing program so bpf_iter and other programs can use it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230808.3988073-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b44505f56b6d..0159f12d2af5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1135,6 +1135,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } -- cgit v1.2.3 From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:09 -0700 Subject: bpf: Add bpf_skc_to_tcp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a tcp6_sock pointer. The return value could be NULL if the casting is illegal. A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added so the verifier is able to deduce proper return types for the helper. Different from the previous BTF_ID based helpers, the bpf_skc_to_tcp6_sock() argument can be several possible btf_ids. More specifically, all possible socket data structures with sock_common appearing in the first in the memory layout. This patch only added socket types related to tcp and udp. All possible argument btf_id and return value btf_id for helper bpf_skc_to_tcp6_sock() are pre-calculcated and cached. In the future, it is even possible to precompute these btf_id's at kernel build time. 
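The series is aimed at tcp/udp socket iterators, but the helper is available to BTF-based tracing programs in general. A hypothetical sketch (assuming vmlinux.h plus libbpf's bpf_helpers.h and bpf_tracing.h; not taken from the patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	/* Illustrative program: cast the sock to a tcp6_sock on connect. */
	SEC("fentry/tcp_v6_connect")
	int BPF_PROG(trace_tcp_v6_connect, struct sock *sk)
	{
		struct tcp6_sock *tp6 = bpf_skc_to_tcp6_sock(sk);

		if (!tp6)	/* cast not legal for this socket */
			return 0;

		bpf_printk("tcp6 rcv_nxt %u\n", tp6->tcp.rcv_nxt);
		return 0;
	}

If the socket is not a TCPv6 socket the helper returns NULL, which is why the verifier models the return value as PTR_TO_BTF_ID_OR_NULL and the program must test it before dereferencing.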
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com --- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++---------- kernel/trace/bpf_trace.c | 2 ++ 3 files changed, 36 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e377d1981730..4c3007f428b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); + init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7460f967cb75..7de98906ddf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0159f12d2af5..2a97a268f533 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1515,6 
+1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? -- cgit v1.2.3 From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:11 -0700 Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers Three more helpers are added to cast a sock_common pointer to an tcp_sock, tcp_timewait_sock or a tcp_request_sock for tracing programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a97a268f533..48d935b0d87c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? -- cgit v1.2.3 From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:15 -0700 Subject: bpf: Add bpf_skc_to_udp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a udp6_sock pointer. The return value could be NULL if the casting is illegal. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 48d935b0d87c..5d59dda5f661 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? -- cgit v1.2.3 From 9d71b344f86f4264a5fae43c997a630e93c0de9b Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 4 Jun 2020 15:31:16 +0530 Subject: kdb: Re-factor kdb_printf() message write code Re-factor kdb_printf() message write code in order to avoid duplication of code and thereby increase readability. 
Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/1591264879-25920-2-git-send-email-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 57 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 924bc9298a42..2d42a02de40e 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -542,6 +542,29 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } +static void kdb_msg_write(const char *msg, int msg_len) +{ + struct console *c; + + if (msg_len == 0) + return; + + if (dbg_io_ops && !dbg_io_ops->is_console) { + const char *cp = msg; + int len = msg_len; + + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + + for_each_console(c) { + c->write(c, msg, msg_len); + touch_nmi_watchdog(); + } +} + int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; @@ -553,7 +576,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - struct console *c; unsigned long uninitialized_var(flags); /* Serialize kdb_printf if multiple cpus try to write at once. @@ -687,22 +709,11 @@ kdb_printit: */ retlen = strlen(kdb_buffer); cp = (char *) printk_skip_headers(kdb_buffer); - if (!dbg_kdb_mode && kgdb_connected) { + if (!dbg_kdb_mode && kgdb_connected) gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); - } else { - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen - (cp - kdb_buffer); - cp2 = cp; - while (len--) { - dbg_io_ops->write_char(*cp2); - cp2++; - } - } - for_each_console(c) { - c->write(c, cp, retlen - (cp - kdb_buffer)); - touch_nmi_watchdog(); - } - } + else + kdb_msg_write(cp, retlen - (cp - kdb_buffer)); + if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; @@ -751,19 +762,7 @@ kdb_printit: moreprompt = "more> "; kdb_input_flush(); - - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(moreprompt); - cp = moreprompt; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } - } - for_each_console(c) { - c->write(c, moreprompt, strlen(moreprompt)); - touch_nmi_watchdog(); - } + kdb_msg_write(moreprompt, strlen(moreprompt)); if (logging) printk("%s", moreprompt); -- cgit v1.2.3 From e8857288bb620d594c94a219148d18562e52b06e Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 4 Jun 2020 15:31:17 +0530 Subject: kdb: Check status of console prior to invoking handlers Check if a console is enabled prior to invoking corresponding write handler. 
Suggested-by: Sergey Senozhatsky Signed-off-by: Sumit Garg Reviewed-by: Daniel Thompson Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/1591264879-25920-3-git-send-email-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 2d42a02de40e..58b7d256d0a4 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -560,6 +560,8 @@ static void kdb_msg_write(const char *msg, int msg_len) } for_each_console(c) { + if (!(c->flags & CON_ENABLED)) + continue; c->write(c, msg, msg_len); touch_nmi_watchdog(); } -- cgit v1.2.3 From 2a78b85b70f9c3d450619d369d349ba861320510 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 4 Jun 2020 15:31:18 +0530 Subject: kdb: Make kdb_printf() console handling more robust While rounding up CPUs via NMIs, its possible that a rounded up CPU maybe holding a console port lock leading to kgdb master CPU stuck in a deadlock during invocation of console write operations. A similar deadlock could also be possible while using synchronous breakpoints. So in order to avoid such a deadlock, set oops_in_progress to encourage the console drivers to disregard their internal spin locks: in the current calling context the risk of deadlock is a bigger problem than risks due to re-entering the console driver. We operate directly on oops_in_progress rather than using bust_spinlocks() because the calls bust_spinlocks() makes on exit are not appropriate for this calling context. Suggested-by: Sergey Senozhatsky Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/1591264879-25920-4-git-send-email-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 58b7d256d0a4..0e4f2eda96d8 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -562,7 +562,18 @@ static void kdb_msg_write(const char *msg, int msg_len) for_each_console(c) { if (!(c->flags & CON_ENABLED)) continue; + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; c->write(c, msg, msg_len); + --oops_in_progress; touch_nmi_watchdog(); } } -- cgit v1.2.3 From 590d69796346353878b275c5512c664e3f875f24 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:52 -0500 Subject: sched: Force the address order of each sched class descriptor In order to make a micro optimization in pick_next_task(), the order of the sched class descriptor address must be in the same order as their priority to each other. That is: &idle_sched_class < &fair_sched_class < &rt_sched_class < &dl_sched_class < &stop_sched_class In order to guarantee this order of the sched class descriptors, add each one into their own data section and force the order in the linker script. 
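The matching linker-script change lives in include/asm-generic/vmlinux.lds.h and is therefore outside this kernel/-only diffstat. Once the follow-up patches below also land, it looks roughly like the sketch here (exact macro text may differ), listing the per-class sections lowest-priority first so the addresses come out in the required order:

	#define SCHED_DATA				\
		STRUCT_ALIGN();				\
		__begin_sched_classes = .;		\
		*(__idle_sched_class)			\
		*(__fair_sched_class)			\
		*(__rt_sched_class)			\
		*(__dl_sched_class)			\
		*(__stop_sched_class)			\
		__end_sched_classes = .;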
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/157675913272.349305.8936736338884044103.stgit@localhost.localdomain --- kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 3 ++- kernel/sched/idle.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/stop_task.c | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d4708e29008f..d9e79462993b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2479,7 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { +const struct sched_class dl_sched_class + __attribute__((section("__dl_sched_class"))) = { .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0424a0af5f87..3365f6b07c36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11122,7 +11122,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { +const struct sched_class fair_sched_class + __attribute__((section("__fair_sched_class"))) = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8d75ca201484..f5806295356b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -453,7 +453,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +const struct sched_class idle_sched_class + __attribute__((section("__idle_sched_class"))) = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f395ddb75f38..6543d4430331 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2429,7 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { +const struct sched_class rt_sched_class + __attribute__((section("__rt_sched_class"))) = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 3e50a6a8f1e5..f4bbd54caae0 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -109,7 +109,8 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { +const struct sched_class stop_sched_class + __attribute__((section("__stop_sched_class"))) = { .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, -- cgit v1.2.3 From c3a340f7e7eadac7662ab104ceb16432e5a4c6b2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:53 -0500 Subject: sched: Have sched_class_highest define by vmlinux.lds.h Now that the sched_class descriptors are defined by the linker script, and this needs to be aware of the existance of stop_sched_class when SMP is enabled or not, as it is used as the "highest" priority when defined. 
Move the declaration of sched_class_highest to the same location in the linker script that inserts stop_sched_class, and this will also make it easier to see what should be defined as the highest class, as this linker script location defines the priorities as well. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191219214558.682913590@goodmis.org --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 17 +++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0208b71bef80..81640fe0eae8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6646,6 +6646,14 @@ void __init sched_init(void) unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 336887607b3d..4165c06d1d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1811,7 +1811,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -}; +} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1825,17 +1825,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __begin_sched_classes[]; +extern struct sched_class __end_sched_classes[]; + +#define sched_class_highest (__end_sched_classes - 1) +#define sched_class_lowest (__begin_sched_classes - 1) #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) + for (class = (_from); class != (_to); class--) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + for_class_range(class, sched_class_highest, sched_class_lowest) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; -- cgit v1.2.3 From a87e749e8fa1aaef9b4db32e21c2795e69ce67bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:54 -0500 Subject: sched: Remove struct sched_class::next field Now that the sched_class descriptors are defined in order via the linker script vmlinux.lds.h, there's no reason to have a "next" pointer to the previous priroity structure. The order of the sturctures can be aligned as an array, and used to index and find the next sched_class descriptor. 
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191219214558.845353593@goodmis.org --- kernel/sched/deadline.c | 1 - kernel/sched/fair.c | 1 - kernel/sched/idle.c | 1 - kernel/sched/rt.c | 1 - kernel/sched/sched.h | 1 - kernel/sched/stop_task.c | 1 - 6 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d9e79462993b..c9cc1d6fa363 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2481,7 +2481,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, const struct sched_class dl_sched_class __attribute__((section("__dl_sched_class"))) = { - .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3365f6b07c36..a63f400013de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11124,7 +11124,6 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ const struct sched_class fair_sched_class __attribute__((section("__fair_sched_class"))) = { - .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5806295356b..336d478bddc8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -455,7 +455,6 @@ static void update_curr_idle(struct rq *rq) */ const struct sched_class idle_sched_class __attribute__((section("__idle_sched_class"))) = { - /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6543d4430331..f215eea6a966 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2431,7 +2431,6 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) const struct sched_class rt_sched_class __attribute__((section("__rt_sched_class"))) = { - .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4165c06d1d7b..549e7e6e0a66 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1754,7 +1754,6 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f4bbd54caae0..394bc8126a1e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -111,7 +111,6 @@ static void update_curr_stop(struct rq *rq) */ const struct sched_class stop_sched_class __attribute__((section("__stop_sched_class"))) = { - .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, -- cgit v1.2.3 From aa93cd53bc1b91b5f99c7b55e3dcc1ac98e99558 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 19 Dec 2019 16:44:55 -0500 Subject: sched: Micro optimization in pick_next_task() and in check_preempt_curr() This introduces an optimization based on xxx_sched_class addresses in two hot scheduler functions: pick_next_task() and check_preempt_curr(). It is possible to compare pointers to sched classes to check, which of them has a higher priority, instead of current iterations using for_each_class(). 
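Concretely, with the descriptors laid out in priority order, "which class is higher?" becomes a plain pointer comparison; a condensed illustration of the check_preempt_curr() change in the hunk that follows:

	/*
	 *   &idle_sched_class < &fair_sched_class < &rt_sched_class
	 *                     < &dl_sched_class   < &stop_sched_class
	 */
	if (p->sched_class > rq->curr->sched_class)
		resched_curr(rq);	/* e.g. an RT task waking over a CFS task */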
One more result of the patch is that size of object file becomes a little less (excluding added BUG_ON(), which goes in __init section): $size kernel/sched/core.o text data bss dec hex filename before: 66446 18957 676 86079 1503f kernel/sched/core.o after: 66398 18957 676 86031 1500f kernel/sched/core.o Signed-off-by: Kirill Tkhai Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/711a9c4b-ff32-1136-b848-17c622d548f3@yandex.ru --- kernel/sched/core.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81640fe0eae8..0a08d0c235d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1412,20 +1412,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -4003,8 +3993,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); -- cgit v1.2.3 From 423d02e1463b21109106f52d94f7396b63731f3b Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 16 Jun 2020 14:04:07 +0800 Subject: sched/fair: Optimize dequeue_task_fair() While looking at enqueue_task_fair and dequeue_task_fair, it occurred to me that dequeue_task_fair can also be optimized as Vincent described in commit 7d148be69e3a ("sched/fair: Optimize enqueue_task_fair()"). When encountering throttled cfs_rq, dequeue_throttle label can ensure se not to be NULL, and rq->nr_running remains unchanged, so we can also skip the early balance check. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/701eef9a40de93dcf5fe7063fd607bca5db38e05.1592287263.git.rocking@linux.alibaba.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a63f400013de..b9b9f19e80c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5624,14 +5624,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } -dequeue_throttle: - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; +dequeue_throttle: util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } -- cgit v1.2.3 From 10e8b11eb3195e11450c509d4dd3984d707a4167 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 25 Jun 2020 13:52:53 +0200 Subject: cpuidle: Rearrange s2idle-specific idle state entry code Implement call_cpuidle_s2idle() in analogy with call_cpuidle() for the s2idle-specific idle state entry and invoke it from cpuidle_idle_call() to make the s2idle-specific idle entry code path look more similar to the "regular" idle entry one. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Chen Yu --- kernel/sched/idle.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 05deb81bb3e3..1ae95b9150d3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void) } } +static int call_cpuidle_s2idle(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + if (current_clr_polling_and_test()) + return -EBUSY; + + return cpuidle_enter_s2idle(drv, dev); +} + static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { @@ -171,11 +180,9 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle()) { rcu_idle_enter(); - entered_state = cpuidle_enter_s2idle(drv, dev); - if (entered_state > 0) { - local_irq_enable(); + entered_state = call_cpuidle_s2idle(drv, dev); + if (entered_state > 0) goto exit_idle; - } rcu_idle_exit(); -- cgit v1.2.3 From bba18a1af33e0f0a1ad7d028208b306ef3f3df12 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 18 Jun 2020 19:47:50 +0300 Subject: console: Propagate error code from console ->setup() Since console ->setup() hook returns meaningful error codes, propagate it to the caller of try_enable_new_console(). Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Acked-by: Benjamin Herrenschmidt Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200618164751.56828-6-andriy.shevchenko@linux.intel.com --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c14835be46c..aaea3ad182e1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2668,7 +2668,7 @@ early_param("keep_bootcon", keep_bootcon_setup); static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; - int i; + int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; @@ -2691,8 +2691,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return 0; if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - return -EIO; + (err = newcon->setup(newcon, c->options)) != 0) + return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) { -- cgit v1.2.3 From 504603767ab639c47912be08d62e02b43125e82e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 18 Jun 2020 19:47:51 +0300 Subject: console: Fix trivia typo 'change' -> 'chance' I bet the word 'chance' has to be used in 'had a chance to be called', but, alas, I'm not native speaker... 
Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Acked-by: Benjamin Herrenschmidt Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200618164751.56828-7-andriy.shevchenko@linux.intel.com --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index aaea3ad182e1..6623e975675a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2705,7 +2705,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() - * and setup() had a change to be called. + * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; -- cgit v1.2.3 From a9b59159d338d414acaa8e2f569d129d51c76452 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 24 Jun 2020 15:20:39 -0700 Subject: bpf: Do not allow btf_ctx_access with __int128 types To ensure btf_ctx_access() is safe the verifier checks that the BTF arg type is an int, enum, or pointer. When the function does the BTF arg lookup it uses the calculation 'arg = off / 8' using the fact that registers are 8B. This requires that the first arg is in the first reg, the second in the second, and so on. However, for __int128 the arg will consume two registers by default LLVM implementation. So this will cause the arg layout assumed by the 'arg = off / 8' calculation to be incorrect. Because __int128 is uncommon this patch applies the easiest fix and will force int types to be sizeof(u64) or smaller so that they will fit in a single register. v2: remove unneeded parens per Andrii's feedback Fixes: 9e15db66136a1 ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159303723962.11287.13309537171132420717.stgit@john-Precision-5820-Tower --- kernel/bpf/btf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..9a1a98dd9e97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit v1.2.3 From b58e733fd774f3f4b49d9e7640d172a57e35200e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 18:24:27 +0200 Subject: rcu: Fixup noinstr warnings A KCSAN build revealed we have explicit annoations through atomic_*() usage, switch to arch_atomic_*() for the respective functions. 
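A hedged sketch of the wrapper-versus-raw distinction, as a userspace model with invented names rather than the kernel's real atomic headers: the instrumented wrapper calls a checker hook, and a call like that from a function placed in .noinstr.text is what the objtool warnings quoted below are about.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

static void kcsan_check_stub(const volatile void *ptr, size_t size)
{
	/* stand-in for the race detector's runtime hook */
	printf("instrumented access: %p (%zu bytes)\n", (void *)ptr, size);
}

/* raw helper: no hooks, the analogue of arch_atomic_read() */
static inline int arch_atomic_read_demo(atomic_int *v)
{
	return atomic_load_explicit(v, memory_order_relaxed);
}

/* instrumented wrapper: the analogue of atomic_read() */
static inline int atomic_read_demo(atomic_int *v)
{
	kcsan_check_stub(v, sizeof(*v));
	return arch_atomic_read_demo(v);
}

int main(void)
{
	atomic_int dynticks = 0;

	atomic_read_demo(&dynticks);	  /* would be flagged in noinstr code */
	arch_atomic_read_demo(&dynticks); /* no instrumentation call emitted */
	return 0;
}

Note that in the patch below the checks are not simply dropped: they are re-added explicitly via instrument_atomic_read()/instrument_atomic_write() inside instrumentation_begin()/end() regions of the callers, so the accesses remain visible to KCSAN without calling hooks from noinstr code.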
vmlinux.o: warning: objtool: rcu_nmi_exit()+0x4d: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_enter()+0x25: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_nmi_enter()+0x4f: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_exit()+0x2a: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: __rcu_is_watching()+0x25: call to __kcsan_check_access() leaves .noinstr.text section Additionally, without the NOP in instrumentation_begin(), objtool would not detect the lack of the 'else instrumentation_begin();' branch in rcu_nmi_enter(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c716eadc7617..6c6569e0586c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,7 +250,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -274,13 +274,13 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); + arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ } } @@ -313,7 +313,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -633,6 +633,10 @@ static noinstr void rcu_eqs_enter(bool user) do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... @@ -692,6 +696,7 @@ noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + instrumentation_begin(); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention @@ -705,7 +710,6 @@ noinstr void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. 
*/ @@ -714,13 +718,15 @@ noinstr void rcu_nmi_exit(void) return; } - instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (!in_nmi()) rcu_prepare_for_idle(); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); // RCU is watching here ... @@ -838,6 +844,10 @@ static void noinstr rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); // ... but is watching here. instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -983,13 +993,21 @@ noinstr void rcu_nmi_enter(void) if (!in_nmi()) rcu_cleanup_after_idle(); + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + incby = 1; } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); instrumentation_end(); + } else { + instrumentation_begin(); } - instrumentation_begin(); + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); -- cgit v1.2.3 From c17d1a3a8ee4dac7539d5c976b45d9300f6f10bc Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 23 Jun 2020 12:12:40 +0800 Subject: fork: annotate data race in copy_process() KCSAN reported data race reading and writing nr_threads and max_threads. The data race is intentional and benign. This is obvious from the comment above it and based on general consensus when discussing this issue. So there's no need for any heavy atomic or *_ONCE() machinery here. In accordance with the newly introduced data_race() annotation consensus, mark the offending line with data_race(). Here it's actually useful not just to silence KCSAN but to also clearly communicate that the race is intentional. This is especially helpful since nr_threads is otherwise protected by tasklist_lock. 
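A userspace analogue of the data_race() idea, as a hedged sketch with invented names; it assumes Clang/GCC ThreadSanitizer rather than KCSAN. The lockless limit check is intentionally racy, and the annotation both silences the detector and documents the intent. The KCSAN report that motivated the kernel change follows below.

/* Build with: cc -fsanitize=thread -pthread demo.c -o demo */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_threads;			/* writes guarded by tasklist_lock */
static const int max_threads = 1000;

/* lockless limit check: racy on purpose, and annotated as such */
__attribute__((no_sanitize("thread")))
static int over_limit(void)
{
	return nr_threads >= max_threads;
}

static void *worker(void *arg)
{
	(void)arg;
	if (over_limit())		/* heuristic check; exactness not needed */
		return NULL;
	pthread_mutex_lock(&tasklist_lock);
	nr_threads++;
	pthread_mutex_unlock(&tasklist_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("nr_threads=%d\n", nr_threads);	/* after joins: no race */
	return 0;
}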
BUG: KCSAN: data-race in copy_process / copy_process write to 0xffffffff86205cf8 of 4 bytes by task 14779 on cpu 1: copy_process+0x2eba/0x3c40 kernel/fork.c:2273 _do_fork+0xfe/0x7a0 kernel/fork.c:2421 __do_sys_clone kernel/fork.c:2576 [inline] __se_sys_clone kernel/fork.c:2557 [inline] __x64_sys_clone+0x130/0x170 kernel/fork.c:2557 do_syscall_64+0xcc/0x3a0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff86205cf8 of 4 bytes by task 6944 on cpu 0: copy_process+0x94d/0x3c40 kernel/fork.c:1954 _do_fork+0xfe/0x7a0 kernel/fork.c:2421 __do_sys_clone kernel/fork.c:2576 [inline] __se_sys_clone kernel/fork.c:2557 [inline] __x64_sys_clone+0x130/0x170 kernel/fork.c:2557 do_syscall_64+0xcc/0x3a0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Link: https://groups.google.com/forum/#!msg/syzkaller-upstream-mo deration/thvp7AHs5Ew/aPdYLXfYBQAJ Reported-by: syzbot+52fced2d288f8ecd2b20@syzkaller.appspotmail.com Signed-off-by: Zefan Li Signed-off-by: Weilong Chen Acked-by: Christian Brauner Cc: Qian Cai Cc: Oleg Nesterov Cc: Christian Brauner Cc: Marco Elver [christian.brauner@ubuntu.com: rewrite commit message] Link: https://lore.kernel.org/r/20200623041240.154294-1-chenweilong@huawei.com Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..efc5493203ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1977,7 +1977,7 @@ static __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. */ retval = -EAGAIN; - if (nr_threads >= max_threads) + if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ -- cgit v1.2.3 From f3bdc62fd82ed93dbe4d049eacba310de7eb2a6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Jun 2020 15:58:23 +0200 Subject: blktrace: Provide event for request merging Currently blk-mq does not report any event when two requests get merged in the elevator. This then results in difficult to understand sequence of events like: ... 8,0 34 1579 0.608765271 2718 I WS 215023504 + 40 [dbench] 8,0 34 1584 0.609184613 2719 A WS 215023544 + 56 <- (8,4) 2160568 8,0 34 1585 0.609184850 2719 Q WS 215023544 + 56 [dbench] 8,0 34 1586 0.609188524 2719 G WS 215023544 + 56 [dbench] 8,0 3 602 0.609684162 773 D WS 215023504 + 96 [kworker/3:1H] 8,0 34 1591 0.609843593 0 C WS 215023504 + 96 [0] and you can only guess (after quite some headscratching since the above excerpt is intermixed with a lot of other IO) that request 215023544+56 got merged to request 215023504+40. Provide proper event for request merging like we used to do in the legacy block layer. 
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c086c38f4954..7ba62d68885a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -860,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore, blk_trace_request_get_cgid(q, rq)); } +static void blk_add_trace_rq_merge(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(q, rq)); +} + static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) @@ -1144,6 +1151,8 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); @@ -1190,6 +1199,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); -- cgit v1.2.3 From fd7af71be54271a9f03b2e6f63e4b3ac1ecd113d Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Thu, 25 Jun 2020 20:29:27 -0700 Subject: kexec: do not verify the signature without the lockdown or mandatory signature Signature verification is an important security feature, to protect system from being attacked with a kernel of unknown origin. Kexec rebooting is a way to replace the running kernel, hence need be secured carefully. In the current code of handling signature verification of kexec kernel, the logic is very twisted. It mixes signature verification, IMA signature appraising and kexec lockdown. If there is no KEXEC_SIG_FORCE, kexec kernel image doesn't have one of signature, the supported crypto, and key, we don't think this is wrong, Unless kexec lockdown is executed. IMA is considered as another kind of signature appraising method. If kexec kernel image has signature/crypto/key, it has to go through the signature verification and pass. Otherwise it's seen as verification failure, and won't be loaded. Seems kexec kernel image with an unqualified signature is even worse than those w/o signature at all, this sounds very unreasonable. E.g. If people get a unsigned kernel to load, or a kernel signed with expired key, which one is more dangerous? So, here, let's simplify the logic to improve code readability. If the KEXEC_SIG_FORCE enabled or kexec lockdown enabled, signature verification is mandated. Otherwise, we lift the bar for any kernel image. Link: http://lkml.kernel.org/r/20200602045952.27487-1-lijiang@redhat.com Signed-off-by: Lianbo Jiang Reviewed-by: Jiri Bohac Acked-by: Dave Young Acked-by: Baoquan He Cc: James Morris Cc: Matthew Garrett Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..09cc78df53c6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image) static int kimage_validate_signature(struct kimage *image) { - const char *reason; int ret; ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, image->kernel_buf_len); - switch (ret) { - case 0: - break; + if (ret) { - /* Certain verification errors are non-fatal if we're not - * checking errors, provided we aren't mandating that there - * must be a valid signature. - */ - case -ENODATA: - reason = "kexec of unsigned image"; - goto decide; - case -ENOPKG: - reason = "kexec of image with unsupported crypto"; - goto decide; - case -ENOKEY: - reason = "kexec of image with unavailable key"; - decide: if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { - pr_notice("%s rejected\n", reason); + pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } - /* If IMA is guaranteed to appraise a signature on the kexec + /* + * If IMA is guaranteed to appraise a signature on the kexec * image, permit it even if the kernel is otherwise locked * down. */ @@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image) security_locked_down(LOCKDOWN_KEXEC)) return -EPERM; - return 0; - - /* All other errors are fatal, including nomem, unparseable - * signatures and signature check failures - even if signatures - * aren't required. - */ - default: - pr_notice("kernel signature verification failed (%d).\n", ret); + pr_debug("kernel signature verification failed (%d).\n", ret); } - return ret; + return 0; } #endif -- cgit v1.2.3 From 7a0e27b2a0ce2735e27e21ebc8b777550fe0ed81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jun 2020 20:30:47 -0700 Subject: mm: remove vmalloc_exec Merge vmalloc_exec into its only caller. Note that for !CONFIG_MMU __vmalloc_node_range maps to __vmalloc, which directly clears the __GFP_HIGHMEM added by the vmalloc_exec stub anyway. Link: http://lkml.kernel.org/r/20200618064307.32739-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Dexuan Cui Cc: Jessica Yu Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..0c6573b98c36 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return vmalloc_exec(size); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __func__); } bool __weak module_init_section(const char *name) -- cgit v1.2.3 From 5946d1f5b309381805bad3ddc3054c04f4ae9c24 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 4 Jun 2020 15:31:19 +0530 Subject: kdb: Switch to use safer dbg_io_ops over console APIs In kgdb context, calling console handlers aren't safe due to locks used in those handlers which could in turn lead to a deadlock. 
Although, using oops_in_progress increases the chance to bypass locks in most console handlers but it might not be sufficient enough in case a console uses more locks (VT/TTY is good example). Currently when a driver provides both polling I/O and a console then kdb will output using the console. We can increase robustness by using the currently active polling I/O driver (which should be lockless) instead of the corresponding console. For several common cases (e.g. an embedded system with a single serial port that is used both for console output and debugger I/O) this will result in no console handler being used. In order to achieve this we need to reverse the order of preference to use dbg_io_ops (uses polling I/O mode) over console APIs. So we just store "struct console" that represents debugger I/O in dbg_io_ops and while emitting kdb messages, skip console that matches dbg_io_ops console in order to avoid duplicate messages. After this change, "is_console" param becomes redundant and hence removed. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Link: https://lore.kernel.org/r/1591264879-25920-5-git-send-email-sumit.garg@linaro.org Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Acked-by: Greg Kroah-Hartman Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0e4f2eda96d8..683a799618ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -549,7 +549,7 @@ static void kdb_msg_write(const char *msg, int msg_len) if (msg_len == 0) return; - if (dbg_io_ops && !dbg_io_ops->is_console) { + if (dbg_io_ops) { const char *cp = msg; int len = msg_len; @@ -562,6 +562,8 @@ static void kdb_msg_write(const char *msg, int msg_len) for_each_console(c) { if (!(c->flags & CON_ENABLED)) continue; + if (c == dbg_io_ops->cons) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling -- cgit v1.2.3 From 440ab9e10e2e6e5fd677473ee6f9e3af0f6904d6 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 2 Jun 2020 15:47:39 -0700 Subject: kgdb: Avoid suspicious RCU usage warning At times when I'm using kgdb I see a splat on my console about suspicious RCU usage. I managed to come up with a case that could reproduce this that looked like this: WARNING: suspicious RCU usage 5.7.0-rc4+ #609 Not tainted ----------------------------- kernel/pid.c:395 find_task_by_pid_ns() needs rcu_read_lock() protection! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by swapper/0/1: #0: ffffff81b6b8e988 (&dev->mutex){....}-{3:3}, at: __device_attach+0x40/0x13c #1: ffffffd01109e9e8 (dbg_master_lock){....}-{2:2}, at: kgdb_cpu_enter+0x20c/0x7ac #2: ffffffd01109ea90 (dbg_slave_lock){....}-{2:2}, at: kgdb_cpu_enter+0x3ec/0x7ac stack backtrace: CPU: 7 PID: 1 Comm: swapper/0 Not tainted 5.7.0-rc4+ #609 Hardware name: Google Cheza (rev3+) (DT) Call trace: dump_backtrace+0x0/0x1b8 show_stack+0x1c/0x24 dump_stack+0xd4/0x134 lockdep_rcu_suspicious+0xf0/0x100 find_task_by_pid_ns+0x5c/0x80 getthread+0x8c/0xb0 gdb_serial_stub+0x9d4/0xd04 kgdb_cpu_enter+0x284/0x7ac kgdb_handle_exception+0x174/0x20c kgdb_brk_fn+0x24/0x30 call_break_hook+0x6c/0x7c brk_handler+0x20/0x5c do_debug_exception+0x1c8/0x22c el1_sync_handler+0x3c/0xe4 el1_sync+0x7c/0x100 rpmh_rsc_probe+0x38/0x420 platform_drv_probe+0x94/0xb4 really_probe+0x134/0x300 driver_probe_device+0x68/0x100 __device_attach_driver+0x90/0xa8 bus_for_each_drv+0x84/0xcc __device_attach+0xb4/0x13c device_initial_probe+0x18/0x20 bus_probe_device+0x38/0x98 device_add+0x38c/0x420 If I understand properly we should just be able to blanket kgdb under one big RCU read lock and the problem should go away. We'll add it to the beast-of-a-function known as kgdb_cpu_enter(). With this I no longer get any splats and things seem to work fine. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200602154729.v2.1.I70e0d4fd46d5ed2aaf0c98a355e8e1b7a5bb7e4e@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bc8d25f2ac8a..9e5934780f41 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -587,6 +587,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, arch_kgdb_ops.disable_hw_break(regs); acquirelock: + rcu_read_lock(); /* * Interrupts will be restored by the 'trap return' code, except when * single stepping. @@ -646,6 +647,7 @@ return_normal: atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return 0; } cpu_relax(); @@ -664,6 +666,7 @@ return_normal: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); goto acquirelock; } @@ -787,6 +790,7 @@ kgdb_restore: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return kgdb_info[cpu].ret_state; } -- cgit v1.2.3 From 985098a05eee6bf5caca7e997e02a5b15242cfa0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 23 Jun 2020 09:09:10 +0200 Subject: docs: fix references for DMA*.txt files As we moved those files to core-api, fix references to point to their newer locations. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/37b2fd159fbc7655dbf33b3eb1215396a25f6344.1592895969.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..f97f088ace7e 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1071,7 +1071,7 @@ static void check_unmap(struct dma_debug_entry *ref) /* * Drivers should use dma_mapping_error() to check the returned * addresses of dma_map_single() and dma_map_page(). - * If not, print this warning message. 
See Documentation/DMA-API.txt. + * If not, print this warning message. See Documentation/core-api/dma-api.rst. */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, -- cgit v1.2.3 From fd844ba9ae59b51e34e77105d79f8eca780b3bd6 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 17 Jun 2020 14:17:42 +0200 Subject: sched/core: Check cpus_mask, not cpus_ptr in __set_cpus_allowed_ptr(), to fix mask corruption This function is concerned with the long-term CPU mask, not the transitory mask the task might have while migrate disabled. Before this patch, if a task was migrate-disabled at the time __set_cpus_allowed_ptr() was called, and the new mask happened to be equal to the CPU that the task was running on, then the mask update would be lost. Signed-off-by: Scott Wood Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200617121742.cpxppyi7twxmpin7@linutronix.de --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..9eeac94224db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1637,7 +1637,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* -- cgit v1.2.3 From ce9bc3b27f2a21a7969b41ffb04df8cf61bd1592 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 17 Jun 2020 09:29:19 +0200 Subject: sched/deadline: Initialize ->dl_boosted syzbot reported the following warning triggered via SYSC_sched_setattr(): WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 setup_new_dl_entity /kernel/sched/deadline.c:594 [inline] WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 enqueue_dl_entity /kernel/sched/deadline.c:1370 [inline] WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 enqueue_task_dl+0x1c17/0x2ba0 /kernel/sched/deadline.c:1441 This happens because the ->dl_boosted flag is currently not initialized by __dl_clear_params() (unlike the other flags) and setup_new_dl_entity() rightfully complains about it. Initialize dl_boosted to 0. 
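A minimal sketch of the failure mode, with made-up names rather than the scheduler's real structures: a reset helper that forgets one flag leaves stale state behind, and a later consistency check, standing in for the WARN_ON that syzbot hit, trips over it.

#include <assert.h>
#include <stdbool.h>

struct dl_entity_demo {
	unsigned long deadline;
	bool boosted;
	bool throttled;
};

static void clear_params(struct dl_entity_demo *dl)
{
	dl->deadline  = 0;
	dl->throttled = false;
	dl->boosted   = false;	/* the fix: without this line a previously
				 * boosted entity keeps the flag after reset */
}

static void setup_new_entity(struct dl_entity_demo *dl)
{
	assert(!dl->boosted);	/* mirrors the warning in setup_new_dl_entity() */
	dl->deadline = 100;
}

int main(void)
{
	struct dl_entity_demo dl = { .boosted = true };

	clear_params(&dl);	/* drop the boosted reset above and this path
				 * reaches setup_new_entity() with stale state */
	setup_new_entity(&dl);
	return 0;
}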
Fixes: 2d3d891d3344 ("sched/deadline: Add SCHED_DEADLINE inheritance logic") Reported-by: syzbot+5ac8bac25f95e8b221e7@syzkaller.appspotmail.com Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Daniel Wagner Link: https://lkml.kernel.org/r/20200617072919.818409-1-juri.lelli@redhat.com --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..f63f337c7147 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2692,6 +2692,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; + dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; -- cgit v1.2.3 From 740797ce3a124b7dd22b7fb832d87bc8fba1cf6f Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 19 Nov 2018 16:32:01 +0100 Subject: sched/core: Fix PI boosting between RT and DEADLINE tasks syzbot reported the following warning: WARNING: CPU: 1 PID: 6351 at kernel/sched/deadline.c:628 enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 At deadline.c:628 we have: 623 static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) 624 { 625 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 626 struct rq *rq = rq_of_dl_rq(dl_rq); 627 628 WARN_ON(dl_se->dl_boosted); 629 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); [...] } Which means that setup_new_dl_entity() has been called on a task currently boosted. This shouldn't happen though, as setup_new_dl_entity() is only called when the 'dynamic' deadline of the new entity is in the past w.r.t. rq_clock and boosted tasks shouldn't verify this condition. Digging through the PI code I noticed that what above might in fact happen if an RT tasks blocks on an rt_mutex hold by a DEADLINE task. In the first branch of boosting conditions we check only if a pi_task 'dynamic' deadline is earlier than mutex holder's and in this case we set mutex holder to be dl_boosted. However, since RT 'dynamic' deadlines are only initialized if such tasks get boosted at some point (or if they become DEADLINE of course), in general RT 'dynamic' deadlines are usually equal to 0 and this verifies the aforementioned condition. Fix it by checking that the potential donor task is actually (even if temporary because in turn boosted) running at DEADLINE priority before using its 'dynamic' deadline value. 
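A standalone sketch of the bug pattern with illustrative names only: an RT donor's deadline field is just leftover zero, so comparing it as if the donor were a DEADLINE task spuriously "wins"; the fix gates on the donor's class before trusting the class-specific field.

#include <stdbool.h>
#include <stdio.h>

enum cls { CLS_RT, CLS_DL };

struct task_demo {
	enum cls cls;
	unsigned long long deadline;	/* only meaningful for CLS_DL */
};

static bool donor_boosts_dl(const struct task_demo *donor,
			    const struct task_demo *holder)
{
	/* buggy version: donor->deadline is 0 for RT, so it always "preempts" */
	/* return donor->deadline < holder->deadline; */

	/* fixed version: only a donor actually running at DEADLINE priority
	 * is compared by deadline */
	return donor->cls == CLS_DL && donor->deadline < holder->deadline;
}

int main(void)
{
	struct task_demo rt_donor  = { .cls = CLS_RT, .deadline = 0 };
	struct task_demo dl_holder = { .cls = CLS_DL, .deadline = 1000 };

	printf("boost? %d\n", donor_boosts_dl(&rt_donor, &dl_holder)); /* 0 */
	return 0;
}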
Fixes: 2d3d891d3344 ("sched/deadline: Add SCHED_DEADLINE inheritance logic") Reported-by: syzbot+119ba87189432ead09b4@syzkaller.appspotmail.com Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Daniel Bristot de Oliveira Tested-by: Daniel Wagner Link: https://lkml.kernel.org/r/20181119153201.GB2119@localhost.localdomain --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9eeac94224db..c1ba2e569fc9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4533,7 +4533,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else -- cgit v1.2.3 From b6e13e85829f032411b896bd2f0d6cbe4b0a3c4a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:23 +0200 Subject: sched/core: Fix ttwu() race Paul reported rcutorture occasionally hitting a NULL deref: sched_ttwu_pending() ttwu_do_wakeup() check_preempt_curr() := check_preempt_wakeup() find_matching_se() is_same_group() if (se->cfs_rq == pse->cfs_rq) <-- *BOOM* Debugging showed that this only appears to happen when we take the new code-path from commit: 2ebb17717550 ("sched/core: Offload wakee task activation if it the wakee is descheduling") and only when @cpu == smp_processor_id(). Something which should not be possible, because p->on_cpu can only be true for remote tasks. Similarly, without the new code-path from commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") this would've unconditionally hit: smp_cond_load_acquire(&p->on_cpu, !VAL); and if: 'cpu == smp_processor_id() && p->on_cpu' is possible, this would result in an instant live-lock (with IRQs disabled), something that hasn't been reported. The NULL deref can be explained however if the task_cpu(p) load at the beginning of try_to_wake_up() returns an old value, and this old value happens to be smp_processor_id(). Further assume that the p->on_cpu load accurately returns 1, it really is still running, just not here. Then, when we enqueue the task locally, we can crash in exactly the observed manner because p->se.cfs_rq != rq->cfs_rq, because p's cfs_rq is from the wrong CPU, therefore we'll iterate into the non-existant parents and NULL deref. The closest semi-plausible scenario I've managed to contrive is somewhat elaborate (then again, actual reproduction takes many CPU hours of rcutorture, so it can't be anything obvious): X->cpu = 1 rq(1)->curr = X CPU0 CPU1 CPU2 // switch away from X LOCK rq(1)->lock smp_mb__after_spinlock dequeue_task(X) X->on_rq = 9 switch_to(Z) X->on_cpu = 0 UNLOCK rq(1)->lock // migrate X to cpu 0 LOCK rq(1)->lock dequeue_task(X) set_task_cpu(X, 0) X->cpu = 0 UNLOCK rq(1)->lock LOCK rq(0)->lock enqueue_task(X) X->on_rq = 1 UNLOCK rq(0)->lock // switch to X LOCK rq(0)->lock smp_mb__after_spinlock switch_to(X) X->on_cpu = 1 UNLOCK rq(0)->lock // X goes sleep X->state = TASK_UNINTERRUPTIBLE smp_mb(); // wake X ttwu() LOCK X->pi_lock smp_mb__after_spinlock if (p->state) cpu = X->cpu; // =? 
1 smp_rmb() // X calls schedule() LOCK rq(0)->lock smp_mb__after_spinlock dequeue_task(X) X->on_rq = 0 if (p->on_rq) smp_rmb(); if (p->on_cpu && ttwu_queue_wakelist(..)) [*] smp_cond_load_acquire(&p->on_cpu, !VAL) cpu = select_task_rq(X, X->wake_cpu, ...) if (X->cpu != cpu) switch_to(Y) X->on_cpu = 0 UNLOCK rq(0)->lock However I'm having trouble convincing myself that's actually possible on x86_64 -- after all, every LOCK implies an smp_mb() there, so if ttwu observes ->state != RUNNING, it must also observe ->cpu != 1. (Most of the previous ttwu() races were found on very large PowerPC) Nevertheless, this fully explains the observed failure case. Fix it by ordering the task_cpu(p) load after the p->on_cpu load, which is easy since nothing actually uses @cpu before this. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Paul E. McKenney Tested-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200622125649.GC576871@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1ba2e569fc9..60791b991b21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2293,8 +2293,15 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } @@ -2378,6 +2385,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2528,7 +2538,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2550,7 +2559,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2614,8 +2622,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * which potentially sends an IPI instead of spinning on p->on_cpu to * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. 
*/ - if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ)) goto unlock; /* @@ -2635,6 +2656,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2642,7 +2665,7 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; -- cgit v1.2.3 From 739f70b476cf05c5a424b42a8b5728914345610c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:24 +0200 Subject: sched/core: s/WF_ON_RQ/WQ_ON_CPU/ Use a better name for this poorly named flag, to avoid confusion... Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20200622100825.785115830@infradead.org --- kernel/sched/core.c | 4 ++-- kernel/sched/sched.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60791b991b21..f778067de277 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2376,7 +2376,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) * the soon-to-be-idle CPU as the current CPU is likely busy. * nr_running is checked to avoid unnecessary task stacking. */ - if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) return true; return false; @@ -2636,7 +2636,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * scheduling. */ if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ)) + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) goto unlock; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..877fb08eb1b0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1682,7 +1682,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_RQ 0x08 /* Wakee is on_rq */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From 8c4890d1c3358fb8023d46e1e554c41d54f02878 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:25 +0200 Subject: smp, irq_work: Continue smp_call_function*() and irq_work*() integration Instead of relying on BUG_ON() to ensure the various data structures line up, use a bunch of horrible unions to make it all automatic. Much of the union magic is to ensure irq_work and smp_call_function do not (yet) see the members of their respective data structures change name. 
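A hedged sketch of the two approaches (invented struct names, not the kernel's): asserting after the fact that two independent structs happen to line up, versus embedding one in an anonymous union so the shared layout holds by construction while the old field names keep working.

#include <assert.h>
#include <stddef.h>

struct csd_demo {			/* stand-in for the call-single data */
	void *llist;
	void (*func)(void *);
	unsigned int flags;
};

struct irq_work_old {			/* separate struct: layout asserted */
	void *llnode;
	void (*func)(void *);
	unsigned int flags;
};

static_assert(offsetof(struct irq_work_old, llnode) ==
	      offsetof(struct csd_demo, llist), "llist/llnode must line up");
static_assert(offsetof(struct irq_work_old, flags) ==
	      offsetof(struct csd_demo, flags), "flags must line up");

struct irq_work_new {			/* union: lines up by construction */
	union {
		struct csd_demo csd;
		struct {
			void *llnode;	/* old names still visible */
			void (*func)(void *);
			unsigned int flags;
		};
	};
};

int main(void)
{
	struct irq_work_new w;

	w.llnode = &w;			/* same storage as w.csd.llist */
	return w.csd.llist == &w ? 0 : 1;
}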
Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20200622100825.844455025@infradead.org --- kernel/sched/core.c | 6 +++--- kernel/smp.c | 18 ------------------ 2 files changed, 3 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f778067de277..ca5db40392d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2293,7 +2293,7 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) { + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { if (WARN_ON_ONCE(p->on_cpu)) smp_cond_load_acquire(&p->on_cpu, !VAL); @@ -2329,7 +2329,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2786,7 +2786,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } diff --git a/kernel/smp.c b/kernel/smp.c index 472c2b274c65..aa17eedff5be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,24 +669,6 @@ void __init smp_init(void) { int num_nodes, num_cpus; - /* - * Ensure struct irq_work layout matches so that - * flush_smp_call_function_queue() can do horrible things. - */ - BUILD_BUG_ON(offsetof(struct irq_work, llnode) != - offsetof(struct __call_single_data, llist)); - BUILD_BUG_ON(offsetof(struct irq_work, func) != - offsetof(struct __call_single_data, func)); - BUILD_BUG_ON(offsetof(struct irq_work, flags) != - offsetof(struct __call_single_data, flags)); - - /* - * Assert the CSD_TYPE_TTWU layout is similar enough - * for task_struct to be on the @call_single_queue. - */ - BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != - offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); - idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From e21cf43406a190adfcc4bfe592768066fb3aaa9b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 24 Jun 2020 17:44:22 +0200 Subject: sched/cfs: change initial value of runnable_avg Some performance regression on reaim benchmark have been raised with commit 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") The problem comes from the init value of runnable_avg which is initialized with max value. This can be a problem if the newly forked task is finally a short task because the group of CPUs is wrongly set to overloaded and tasks are pulled less agressively. Set initial value of runnable_avg equals to util_avg to reflect that there is no waiting time so far. 
Fixes: 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") Reported-by: kernel test robot Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200624154422.29166-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..658aa7a2ae6f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct task_struct *p) } } - sa->runnable_avg = cpu_scale; + sa->runnable_avg = sa->util_avg; if (p->sched_class != &fair_sched_class) { /* -- cgit v1.2.3 From 71cdec4fab76667dabdbb2ca232b039004ebd40f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 21 Jun 2020 13:43:02 -0700 Subject: dma-mapping: warn when coherent pool is depleted When a DMA coherent pool is depleted, allocation failures may or may not get reported in the kernel log depending on the allocator. The admin does have a workaround, however, by using coherent_pool= on the kernel command line. Provide some guidance on the failure and a recommended minimum size for the pools (double the size). Signed-off-by: David Rientjes Tested-by: Guenter Roeck Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 8cfa01243ed2..39ca26fa41b5 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -239,12 +239,16 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, } val = gen_pool_alloc(pool, size); - if (val) { + if (likely(val)) { phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); + } else { + WARN_ONCE(1, "DMA coherent pool depleted, increase size " + "(recommended min coherent_pool=%zuK)\n", + gen_pool_size(pool) >> 9); } if (gen_pool_avail(pool) < atomic_pool_size) schedule_work(&atomic_pool_work); -- cgit v1.2.3 From c791cc4b1feb881b3644c059dba5464b076bf592 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Jun 2020 14:53:55 -0400 Subject: tracing: Only allow trace_array_printk() to be used by instances To prevent default "trace_printks()" from spamming the top level tracing ring buffer, only allow trace instances to use trace_array_printk() (which can be used without the trace_printk() start up warning). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..8241d1448d70 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3346,12 +3346,16 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) - return 0; - if (!tr) return -ENOENT; + /* This is only allowed for created instances */ + if (tr == &global_trace) + return 0; + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + return 0; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); -- cgit v1.2.3 From 7582f30cc9fbcdcd630398bea010e5f6a2e2fcab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:56 +0200 Subject: cgroup: unexport cgroup_rstat_updated cgroup_rstat_updated is only used by core block code, no need to export it. 
Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/cgroup/rstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b6397a186ce9..d51175cedfca 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -64,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_unlock_irqrestore(cpu_lock, flags); } -EXPORT_SYMBOL_GPL(cgroup_rstat_updated); /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree -- cgit v1.2.3 From 5da7cd11d0811c35a6988d416053b5421bc61521 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 22 Apr 2020 12:25:41 -0400 Subject: x86/ftrace: Only have the builtin ftrace_regs_caller call direct hooks If a direct hook is attached to a function that ftrace also has a function attached to it, then it is required that the ftrace_ops_list_func() is used to iterate over the registered ftrace callbacks. This will also include the direct ftrace_ops helper, that tells ftrace_regs_caller where to return to (the direct callback and not the function that called it). As this direct helper is only to handle the case of ftrace callbacks attached to the same function as the direct callback, the ftrace callback allocated trampolines (used to only call them), should never be used to return back to a direct callback. Only copy the portion of the ftrace_regs_caller that will return back to what called it, and not the portion that returns back to the direct caller. The direct ftrace_ops must then pick the ftrace_regs_caller builtin function as its own trampoline to ensure that it will never have one allocated for it (which would not include the handling of direct callbacks). Link: http://lkml.kernel.org/r/20200422162750.495903799@goodmis.org Cc: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..c141d347f71a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2388,6 +2388,14 @@ struct ftrace_ops direct_ops = { .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, + /* + * By declaring the main trampoline as this trampoline + * it will never have one allocated for it. Allocated + * trampolines should not call direct functions. + * The direct_ops should only be called by the builtin + * ftrace_regs_caller trampoline. + */ + .trampoline = FTRACE_REGS_ADDR, }; #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -- cgit v1.2.3 From 43cb5451dffe0bc5d59688d4898c9a1f7c40d3b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:06 +0200 Subject: docs: RCU: Convert torture.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..8205295fc33e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -7,7 +7,7 @@ * Authors: Paul E. 
McKenney * Josh Triplett * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt -- cgit v1.2.3 From f2286ab99549271f3cec73e305b9ecca95d91394 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:10 +0200 Subject: docs: RCU: Convert stallwarn.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Fix list markups; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..b04256cd7e12 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -468,7 +468,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) /* * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); @@ -535,7 +535,7 @@ static void print_cpu_stall(unsigned long gps) /* * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); -- cgit v1.2.3 From 7ee880b7bf1dea88d0a472b775aebdb4fb6bf860 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 15 Apr 2020 22:26:55 +0000 Subject: rcu: Initialize and destroy rcu_synchronize only when necessary The __wait_rcu_gp() function unconditionally initializes and cleans up each element of rs_array[], whether used or not. This is slightly wasteful and rather confusing, so this commit skips both initialization and cleanup for duplicate callback functions. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..f5a82e107bcb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -390,13 +390,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. 
*/ @@ -407,9 +408,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); -- cgit v1.2.3 From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 19 Apr 2020 21:57:15 +0000 Subject: rcu: Simplify the calculation of rcu_state.ncpus There is only 1 bit set in mask, which means that the only difference between oldmask and the new one will be at the position where the bit is set in mask. This commit therefore updates rcu_state.ncpus by checking whether the bit in mask is already set in rnp->expmaskinitnext. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..bef1dc91bfbe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); -- cgit v1.2.3 From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 May 2020 16:49:48 -0700 Subject: rcu: Add callbacks-invoked counters This commit adds a count of the callbacks invoked to the per-CPU rcu_data structure. This count is printed by the show_rcu_gp_kthreads() that is invoked by rcutorture and the RCU CPU stall-warning code. It is also intended for use by drgn. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 3 +++ 3 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bef1dc91bfbe..874c831bcc45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..9c6f7343bec0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. 
*/ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..2768ce6bf657 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From f8466f94685b5bd931384526cf51e090fd2ac706 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 May 2020 19:16:09 -0700 Subject: rcu: Add comment documenting rcu_callback_map's purpose The rcu_callback_map lockdep_map structure was added back in 2013, but its purpose has become obscure. This commit therefore documments that the purpose of rcu_callback map is, in the words of commit 24ef659a857 ("rcu: Provide better diagnostics for blocking in RCU callback functions"), to help lockdep to tie an "inappropriate voluntary context switch back to the fact that the function is being invoked from within a callback." Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5a82e107bcb..ca17b771ad60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -279,6 +279,7 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); -- cgit v1.2.3 From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 15:44:46 -0700 Subject: rcu: Grace-period-kthread related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by RCU's grace-period kthread to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 874c831bcc45..feb31c201dee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } -- cgit v1.2.3 From a9352f72d6a9e8fe4840b9f0d97af8f5a6c52c79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:34:38 -0700 Subject: rcu: Priority-boost-related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() call used by RCU's priority-boosting kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 352223664ebd..25296c17a30d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } -- cgit v1.2.3 From f5ca34643bbd84f514bdeee194c45dd1fb066ef2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:36:10 -0700 Subject: rcu: No-CBs-related sleeps to idle priority This commit converts the schedule_timeout_interruptible() call used by RCU's no-CBs grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25296c17a30d..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. */ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); -- cgit v1.2.3 From 68c2f27e01f61760e6ae76fff9682e1ffe9bacb6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 7 May 2020 16:38:29 -0700 Subject: rcu: Expedited grace-period sleeps to idle priority This commit converts the schedule_timeout_uninterruptible() call used by RCU's expedited grace-period processing to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72952edad1e4..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -403,7 +403,7 @@ retry_ipi: /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. */ -- cgit v1.2.3 From 360fbbb4897c98971e8955b063c01250817a2191 Mon Sep 17 00:00:00 2001 From: Lihao Liang Date: Thu, 14 May 2020 21:34:34 +0100 Subject: rcu: Update comment from rsp->rcu_gp_seq to rsp->gp_seq Signed-off-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c6f7343bec0..575745f0a464 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ -- cgit v1.2.3 From 3c8920e2dbd1a55f72dc14d656df9d0097cf5c72 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 15 May 2020 02:34:29 +0200 Subject: tick/nohz: Narrow down noise while setting current task's tick dependency Setting a tick dependency on any task, including the case where a task sets that dependency on itself, triggers an IPI to all CPUs. That is of course suboptimal but it had previously not been an issue because it was only used by POSIX CPU timers on nohz_full, which apparently never occurs in latency-sensitive workloads in production. (Or users of such systems are suffering in silence on the one hand or venting their ire on the wrong people on the other.) But RCU now sets a task tick dependency on the current task in order to fix stall issues that can occur during RCU callback processing. Thus, RCU callback processing triggers frequent system-wide IPIs from nohz_full CPUs. This is quite counter-productive, after all, avoiding IPIs is what nohz_full is supposed to be all about. This commit therefore optimizes tasks' self-setting of a task tick dependency by using tick_nohz_full_kick() to avoid the system-wide IPI. 
Instead, only the execution of the one task is disturbed, which is acceptable given that this disturbance is well down into the noise compared to the degree to which the RCU callback processing itself disturbs execution. Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks) Reported-by: Matt Fleming Signed-off-by: Frederic Weisbecker Cc: stable@kernel.org Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); -- cgit v1.2.3 From 55fbe86ef303bc8ab040e579fba34a750c08200e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 15:02:02 -0700 Subject: rcu: Remove initialized but unused rnp from check_slow_task() This commit removes the variable rnp from check_slow_task(), which is defined, assigned to, but not otherwise used. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2768ce6bf657..d203f82a380a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr { */ static bool check_slow_task(struct task_struct *t, void *arg) { - struct rcu_node *rnp; struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) return false; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; - rnp = t->rcu_blocked_node; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); return true; } -- cgit v1.2.3 From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 17:00:54 -0700 Subject: rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr The objtool complains about the call to rcu_cleanup_after_idle() from rcu_nmi_enter(), so this commit adds instrumentation_begin() before that call and instrumentation_end() after it. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index feb31c201dee..d17e5a08bf43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() -- cgit v1.2.3 From d29e0b26b020422cc51b5b51733cc50fcf443965 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 08:49:29 -0700 Subject: lockdep: Complain only once about RCU in extended quiescent state Currently, lockdep_rcu_suspicious() complains twice about RCU read-side critical sections being invoked from within extended quiescent states, for example: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 1 RCU used illegally from extended quiescent state! This commit therefore saves a couple lines of code and one line of console-log output by eliminating the first of these two complaints. Link: https://lore.kernel.org/lkml/87wo4wnpzb.fsf@nanos.tec.linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..0a7549d159ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* -- cgit v1.2.3 From e40bb921119814c6f746891af9cd37eccda616a4 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:49 +0100 Subject: rcu: Replace 1 with true Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 1. This commit therefore replaces the 1 with a true. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca17b771ad60..a0ba8858dd35 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -207,7 +207,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* -- cgit v1.2.3 From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 4 Jun 2020 12:23:20 +0200 Subject: rcu: Stop shrinker loop The count and scan can be separated in time, and there is a fair chance that all work is already done when the scan starts, which might in turn result in a needless retry. This commit therefore avoids this retry by returning SHRINK_STOP. Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Enderborg Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d17e5a08bf43..c8196fab563c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { -- cgit v1.2.3 From 00943a609d7ad0f08e58bc9c214f38b0ba163c88 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:52 +0800 Subject: rcu: gp_max is protected by root rcu_node's lock Because gp_max is protected by root rcu_node's lock, this commit moves the gp_max definition to the region of the rcu_node structure containing fields protected by this lock. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 575745f0a464..09ec93b16f28 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -302,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -347,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ -- cgit v1.2.3 From a2dae43088d51c4869e7fa91ca09bcc890e277fc Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:53 +0800 Subject: rcu: grplo/grphi just records CPU number The ->grplo and ->grphi fields store the lowest and highest CPU number covered by to a rcu_node structure, which is not the group number. This commit therefore adjusts these fields' comments to match reality. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 09ec93b16f28..9f903f5c9fa1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -73,8 +73,8 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ -- cgit v1.2.3 From 7a0c2b0940c13a06573320ab7118375b35feef8b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:54 +0800 Subject: rcu: grpnum just records group number The ->grpnum field in the rcu_node structure contains the bit position in this structure's parent's bitmasks, which is not the CPU number. This commit therefore adjusts this field's comment accordingly. Signed-off-by: Wei Yang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9f903f5c9fa1..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -75,7 +75,7 @@ struct rcu_node { /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU here. */ int grphi; /* highest-numbered CPU here. */ - u8 grpnum; /* CPU/group number for next level up. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ -- cgit v1.2.3 From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jun 2020 12:28:05 -0700 Subject: kernel/rcu/tree.c: Fix kernel-doc warnings Fix kernel-doc warning: ../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter' Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()") Signed-off-by: Randy Dunlap Cc: Byungchul Park Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c8196fab563c..ef05aac7f9d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know -- cgit v1.2.3 From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 4 May 2020 14:35:00 +0200 Subject: rcu: Fix a kernel-doc warnings for "count" There are some kernel-doc warnings: ./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu' This commit therefore moves the comment for "count" to the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ba4c477495b5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work { * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @lock and @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; }; -- cgit v1.2.3 From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:45 +0200 Subject: rcu/tree: Keep kfree_rcu() awake during lock contention On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex and causes kfree_rcu() callers to sleep. This makes it unusable for callers in purely atomic sections such as non-threaded IRQ handlers and raw spinlock sections. Fix it by converting the spinlock to a raw spinlock. 
Vetting all code paths, there is no reason to believe that the raw spinlock will hurt RT latencies as it is not held for a long time. Cc: bigeasy@linutronix.de Cc: Uladzislau Rezki Reviewed-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba4c477495b5..c5de5adca0dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu { struct kfree_rcu_bulk_data *bhead; struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; @@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work) krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; bhead = krwp->bhead_free; krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); /* "bhead" is now private, so traverse locklessly. */ for (; bhead; bhead = bnext) { @@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool @@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) - spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. 
if (debug_rcu_head_queue(head)) { @@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: if (krcp->initialized) - spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_init(&krcp->lock); + raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:46 +0200 Subject: rcu/tree: Skip entry into the page allocator for PREEMPT_RT To keep the kfree_rcu() code working in purely atomic sections on RT, such as non-threaded IRQ handlers and raw spinlock sections, avoid calling into the page allocator which uses sleeping locks on RT. In fact, even if the caller is preemptible, the kfree_rcu() code is not, as the krcp->lock is a raw spinlock. Calling into the page allocator is optional and avoiding it should be Ok, especially with the page pre-allocation support in future patches. Such pre-allocation would further avoid the a need for a dynamically allocated page in the first place. Cc: Sebastian Andrzej Siewior Reviewed-by: Uladzislau Rezki Co-developed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5de5adca0dd..e0425faf3b3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. 
+ */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + bnode = (struct kfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } -- cgit v1.2.3 From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:47 +0200 Subject: rcu/tree: Repeat the monitor if any free channel is busy It is possible that one of the channels cannot be detached because its free channel is busy and previously queued data has not been processed yet. On the other hand, another channel can be successfully detached causing the monitor work to stop. Prevent that by rescheduling the monitor work if there are any channels in the pending state after a detach attempt. Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()") Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e0425faf3b3b..5151fe4e1429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; + bool repeat = false; int i; lockdep_assert_held(&krcp->lock); @@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) * been detached following each other, one by one. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + /* Repeat if any "free" corresponding channel is still busy. */ + if (krcp->bhead || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, -- cgit v1.2.3 From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:48 +0200 Subject: rcu/tree: Make debug_objects logic independent of rcu_head kfree_rcu()'s debug_objects logic uses the address of the object's embedded rcu_head to queue/unqueue. Instead of this, make use of the object's address itself as preparation for future headless kfree_rcu() support. Reviewed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5151fe4e1429..143c1e9265b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * @nr_records: Number of active pointers in the array * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set */ struct kfree_rcu_bulk_data { unsigned long nr_records; void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; }; /** @@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } @@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; bhead; bhead = bnext) { bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + debug_rcu_bhead_unqueue(bhead); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, @@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + kfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; /* Attach it to the head. */ krcp->bhead = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ krcp->bhead->records[krcp->bhead->nr_records++] = (void *) head - (unsigned long) func; @@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + void *ptr; local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) raw_spin_lock(&krcp->lock); + ptr = (void *)head - (unsigned long)func; + // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); -- cgit v1.2.3 From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:49 +0200 Subject: rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro We can simplify KFREE_BULK_MAX_ENTR macro and get rid of magic numbers which were used to make the structure to be exactly one page. 
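As a worked example (assuming 4 KiB pages and 64-bit pointers, so a 16-byte header of nr_records plus next; other page sizes scale accordingly), the flexible-array form of the macro works out to:

        KFREE_BULK_MAX_ENTR = (PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)
                            = (4096 - 16) / 8
                            = 510 pointers per block

        16 + 510 * 8 = 4096     /* the block fills the page exactly */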
Suggested-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 143c1e9265b6..bcdc06364426 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) - /** * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers * @nr_records: Number of active pointers in the array @@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period -- cgit v1.2.3 From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:50 +0200 Subject: rcu/tree: Move kfree_rcu_cpu locking/unlocking to separate functions Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu" structures. That will make kfree_call_rcu() more readable and prevent programming errors. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcdc06364426..368bdc441ffb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (likely(krcp->initialized)) + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + if (likely(krcp->initialized)) + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) struct kfree_rcu_cpu *krcp; void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - raw_spin_lock(&krcp->lock); - + krcp = krc_this_cpu_lock(&flags); ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. 
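A minimal sketch of the intended call pattern, matching its use in kfree_call_rcu() in the hunk below (queuing details omitted):

        unsigned long flags;
        struct kfree_rcu_cpu *krcp;

        krcp = krc_this_cpu_lock(&flags);       /* disable interrupts, take the per-CPU lock */
        /* ... queue the object on this CPU's kfree_rcu_cpu lists ... */
        krc_this_cpu_unlock(krcp, flags);       /* drop the lock, restore interrupts */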
@@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); -- cgit v1.2.3 From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 May 2020 23:47:51 +0200 Subject: rcu/tree: Use static initializer for krc.lock The per-CPU variable is initialized at runtime in kfree_rcu_batch_init(). This function is invoked before 'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'. After the initialisation, '->initialized' is to true. The raw_spin_lock is only acquired if '->initialized' is set to true. The worqueue item is only used if 'rcu_scheduler_active' set to RCU_SCHEDULER_RUNNING which happens after initialisation. Use a static initializer for krc.lock and remove the runtime initialisation of the lock. Since the lock can now be always acquired, remove the '->initialized' check. Cc: Sebastian Andrzej Siewior Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 368bdc441ffb..a42a4693f161 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work { * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in @@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu { int count; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) @@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags) local_irq_save(*flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); - if (likely(krcp->initialized)) - raw_spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); return krcp; } @@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - if (likely(krcp->initialized)) - raw_spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } @@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:52 +0200 Subject: rcu/tree: cache specified number of objects In order to reduce the dynamic need for pages in kfree_rcu(), pre-allocate a configurable number of pages per CPU and link them in a list. 
When kfree_rcu() reclaims objects, the object's container page is cached into a list instead of being released to the low-level page allocator. Such an approach provides O(1) access to free pages while also reducing the number of requests to the page allocator. It also makes the kfree_rcu() code to have free pages available during a low memory condition. A read-only sysfs parameter (rcu_min_cached_objs) reflects the minimum number of allowed cached pages per CPU. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a42a4693f161..37c0cd0332f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work { * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work { struct kfree_rcu_cpu { struct rcu_head *head; struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { @@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } +static inline struct kfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. 
@@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work) kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bhead)) + bhead = NULL; + krc_this_cpu_unlock(krcp, flags); + + if (bhead) free_page((unsigned long) bhead); cond_resched_tasks_rcu_qs(); @@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Check if a new block is required. */ if (!krcp->bhead || krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + bnode = get_cached_bnode(krcp); if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); @@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:53 +0200 Subject: rcu/tree: Maintain separate array for vmalloc ptrs To do so, we use an array of kvfree_rcu_bulk_data structures. It consists of two elements: - index number 0 corresponds to slab pointers. - index number 1 corresponds to vmalloc pointers. Keeping vmalloc pointers separated from slab pointers makes it possible to invoke the right freeing API for the right kind of pointer. It also prepares us for future headless support for vmalloc and SLAB objects. Such objects cannot be queued on a linked list and are instead directly into an array. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 173 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37c0cd0332f8..67c4b984c499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - struct kfree_rcu_bulk_data *next; + struct kvfree_rcu_bulk_data *next; void *records[]; }; /* * This macro defines how many entries the "records" array * will contain. 
It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. + * kvfree_rcu_bulk_data structure becomes exactly one page. */ -#define KFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; @@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { }; static __always_inline void -debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD int i; @@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } -static inline struct kfree_rcu_bulk_data * +static inline struct kvfree_rcu_bulk_data * get_cached_bnode(struct kfree_rcu_cpu *krcp) { if (!krcp->nr_bkv_objs) return NULL; krcp->nr_bkv_objs--; - return (struct kfree_rcu_bulk_data *) + return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } static inline bool put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kfree_rcu_bulk_data *bnode) + struct kvfree_rcu_bulk_data *bnode) { // Check the limit. if (krcp->nr_bkv_objs >= rcu_min_cached_objs) @@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } + + // Channel 3. 
head = krwp->head_free; krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; - - debug_rcu_bhead_unqueue(bhead); - - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); - - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); + + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); + + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); + + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); - if (put_cached_bnode(krcp, bhead)) - bhead = NULL; - krc_this_cpu_unlock(krcp, flags); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (bhead) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work) trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; bool repeat = false; - int i; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. 
It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); } - /* Repeat if any "free" corresponding channel is still busy. */ - if (krcp->bhead || krcp->head) + // Repeat if any "free" corresponding channel is still busy. + if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) repeat = true; } @@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work) } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); - /* * To keep this path working on raw non-preemptible * sections, prevent the optional entry into the @@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. 
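 *
 * Illustrative caller-side sketch (not part of this patch; the usual
 * kfree_rcu() wrapper encodes the offset of the rcu_head field as the
 * "callback" passed to this function):
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rh;
 *	};
 *
 *	// After unlinking fp from all RCU-protected structures:
 *	kfree_rcu(fp, rh);	// fp is batched and freed after a grace period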
*/ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { head->func = func; head->next = krcp->head; krcp->head = head; @@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); @@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void) } for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (bnode) -- cgit v1.2.3 From 64d1d06ccb1b7de245ccf781b91517f328bebd9f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:54 +0200 Subject: rcu/tiny: support vmalloc in tiny-RCU Replace kfree() with kvfree() in rcu_reclaim_tiny(). This makes it possible to release either SLAB or vmalloc objects after a GP. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..4b99f7b88bee 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu.h" @@ -86,7 +87,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } -- cgit v1.2.3 From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:55 +0200 Subject: rcu: Rename *_kfree_callback/*_kfree_rcu_offset/kfree_call_* The following changes are introduced: 1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(), as well as the associated trace events, so the rcu_kfree_callback(), becomes rcu_kvfree_callback(). The reason is to be aligned with kvfree() notation. 2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU paths use kvfree() now instead of kfree(), thus rename it. 3. Rename kfree_call_rcu() to the kvfree_call_rcu(). The reason is, it is capable of freeing vmalloc() memory now. Do the same with __kfree_rcu() macro, it becomes __kvfree_rcu(), the goal is the same. Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b99f7b88bee..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67c4b984c499..f22c47e72287 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work) bkvhead[i]->records); } else { // vmalloc() / vfree(). for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kfree_callback( + trace_rcu_invoke_kvfree_callback( rcu_state.name, bkvhead[i]->records[j], 0); @@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work) next = head->next; debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) kvfree(ptr); rcu_lock_release(&rcu_callback_map); @@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) * one, that is used only when the main path can not be maintained temporary, * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; @@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -- cgit v1.2.3 From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:58 +0200 Subject: rcu: Support reclaim for head-less object Update the kvfree_call_rcu() function with head-less support. 
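As an illustrative sketch only (assuming the kvfree_rcu() wrapper macros
that accompany this series in the header files, which lie outside the
kernel/ diffs shown here; struct foo, fp and vptr are hypothetical), the
two calling conventions look roughly like this:

	struct foo { int data; struct rcu_head rh; };

	/* Double-argument form: rcu_head embedded in the object, usable
	 * from non-sleepable context. */
	kvfree_rcu(fp, rh);

	/* Single-argument, head-less form: no rcu_head needed, but the
	 * caller must be in sleepable context, because the fallback path
	 * may do synchronize_rcu() followed by kvfree() inline. */
	kvfree_rcu(vptr);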
This allows RCU to reclaim objects without an embedded rcu_head. tree-RCU: We introduce two chains of arrays to store SLAB-backed and vmalloc pointers, each. Storage in either of these arrays does not require embedding an rcu_head within the object. Maintaining the arrays may become impossible due to high memory pressure. For such cases there is an emergency path. Objects with rcu_head inside are just queued on a backup rcu_head list. Later on that list is drained. As for the head-less variant, as the current context can sleep, the following emergency measures are applied: a) Synchronously wait until a grace period has elapsed. b) Call kvfree(). tiny-RCU: For double argument calls, there are no new changes in behavior. For single argument call, kvfree() is directly inlined on the current stack after a synchronize_rcu() call. Note that for tiny-RCU, any call to synchronize_rcu() is actually a quiescent state, therefore it does nothing. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f22c47e72287..01f29e4500ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; void *ptr; + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + krcp = krc_this_cpu_lock(&flags); - ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). 
We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -- cgit v1.2.3 From ea6eed9f7d7382c7230202d4c3bf74185f193394 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:47:13 -0700 Subject: rcu-tasks: Convert sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by the various Tasks RCU's grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with Tasks-RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..91fee8122acd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -205,7 +205,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_interruptible(HZ/10); + schedule_timeout_idle(HZ/10); } continue; } @@ -227,7 +227,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); + schedule_timeout_idle(HZ/10); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -336,7 +336,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_interruptible(HZ/fract); + schedule_timeout_idle(HZ/fract); if (fract > 1) fract--; -- cgit v1.2.3 From 04a3c5aa7a8cb2ce97f9beb627ba742bc8b0fe03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:27:06 -0700 Subject: rcu-tasks: Make rcu_tasks_postscan() be static The rcu_tasks_postscan() function is not used outside of RCU's tasks.h file, so this commit makes it be static. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 91fee8122acd..da200e53d60d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -402,7 +402,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(struct list_head *hop) +static void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. This -- cgit v1.2.3 From 5b3cc99bedf5885055fbaf35fe63d205f06b5be5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:33:47 -0700 Subject: rcu-tasks: Add #include of rcupdate_trace.h to update.c Although this is in some strict sense unnecessary, it is good to allow the compiler to compare the function declaration with its definition. This commit therefore adds a #include of linux/rcupdate_trace.h to kernel/rcu/update.c. Reported-by: kbuild test robot Signed-off-by: Paul E. 
McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..c0fea809d738 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS -- cgit v1.2.3 From 8344496e8b49c4122c1808d6cd3f8dc71bccb595 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 20:03:48 -0700 Subject: rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads() The show_rcu_tasks_gp_kthreads() function is not invoked by Tiny RCU, but is nevertheless defined in Tiny RCU builds that enable Tasks Trace RCU. This commit therefore conditionally compiles this function so that it is defined only in builds that actually use it. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index da200e53d60d..d5c003c1972c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644); #define RTGS_WAIT_READERS 9 #define RTGS_INVOKE_CBS 10 #define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INIT", "RTGS_WAIT_WAIT_CBS", @@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INVOKE_CBS", "RTGS_WAIT_CBS", }; +#endif /* #ifndef CONFIG_TINY_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) rtp->gp_jiffies = jiffies; } +#ifndef CONFIG_TINY_RCU /* Return state name. */ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) { @@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) return "???"; return rcu_tasks_gp_state_names[j]; } +#endif /* #ifndef CONFIG_TINY_RCU */ // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, @@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } +#endif /* #ifndef CONFIG_TINY_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ /* Do the srcu_read_lock() for the above synchronize_srcu(). 
*/ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RUDE_RCU */ static void show_rcu_tasks_rude_gp_kthread(void) {} @@ -1164,6 +1174,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1174,18 +1185,21 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); show_rcu_tasks_trace_gp_kthread(); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} -- cgit v1.2.3 From 30d8aa5128f12c9d781b67c9694c1abfa4f6ce6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Jun 2020 09:24:51 -0700 Subject: rcu-tasks: Fix code-style issues This commit declares trc_n_readers_need_end and trc_wait static and replaced a "&" with "&&". The "&" happened to work because the values are bool, but accidents waiting to happen and all that... Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5c003c1972c..828f222895f1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -737,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -atomic_t trc_n_readers_need_end; // Number of waited-for readers. -DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -845,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { - WARN_ON_ONCE(ofl & !is_idle_task(t)); + WARN_ON_ONCE(ofl && !is_idle_task(t)); // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) -- cgit v1.2.3 From 7e866460cc18797b3a59360f5f8c444598a21729 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:47 -0400 Subject: rcuperf: Remove useless while loops around wait_event wait_event() already retries if the condition for the wake up is not satisifed after wake up. Remove them from the rcuperf test. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..246da8fe199e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -576,11 +576,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -693,11 +690,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ -- cgit v1.2.3 From 653ed64b01dc5989f8f579d0038e987476c2c023 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:48 -0400 Subject: refperf: Add a test to measure performance of read-side synchronization Add a test for comparing the performance of RCU with various read-side synchronization mechanisms. The test has proved useful for collecting data and performing these comparisons. Currently RCU, SRCU, reader-writer lock, reader-writer semaphore and reference counting can be measured using refperf.perf_type parameter. Each invocation of the test runs measures performance of a specific mechanism. The maximum number of CPUs to concurrently run readers on is chosen by the test itself and is 75% of the total number of CPUs. So if you had 24 CPUs, the test runs with a maximum of 18 parallel readers. A number of experiments are conducted, and in each experiment, the number of readers is increased by 1, upto the 75% of CPUs mark. During each experiment, all readers execute an empty loop with refperf.loops iterations and time the total loop duration. This is then averaged. Example output: Parameters "refperf.perf_type=srcu refperf.loops=2000000" looks like: [ 3.347133] srcu-ref-perf: [ 3.347133] Threads Time(ns) [ 3.347133] 1 36 [ 3.347133] 2 34 [ 3.347133] 3 34 [ 3.347133] 4 34 [ 3.347133] 5 33 [ 3.347133] 6 33 [ 3.347133] 7 33 [ 3.347133] 8 33 [ 3.347133] 9 33 [ 3.347133] 10 33 [ 3.347133] 11 33 [ 3.347133] 12 33 [ 3.347133] 13 33 [ 3.347133] 14 33 [ 3.347133] 15 32 [ 3.347133] 16 33 [ 3.347133] 17 33 [ 3.347133] 18 34 Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 19 ++ kernel/rcu/Makefile | 1 + kernel/rcu/refperf.c | 558 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 578 insertions(+) create mode 100644 kernel/rcu/refperf.c (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 452feae8de20..858765b7f644 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. 
+config RCU_REF_PERF_TEST + tristate "Performance tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..ba7d82609cbe 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c new file mode 100644 index 000000000000..61161530acc8 --- /dev/null +++ b/kernel/rcu/refperf.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Performance test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define PERF_FLAG "-ref-perf: " + +#define PERFOUT(s, x...) \ + pr_alert("%s" PERF_FLAG s, perf_type, ## x) + +#define VERBOSE_PERFOUT(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) + +#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Number of loops per experiment, all readers execute an operation concurrently +torture_param(long, loops, 10000000, "Number of loops per experiment."); + +#ifdef MODULE +# define REFPERF_SHUTDOWN 0 +#else +# define REFPERF_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFPERF_SHUTDOWN, + "Shutdown at end of performance tests."); + +struct reader_task { + struct task_struct *task; + atomic_t start; + wait_queue_head_t wq; + u64 last_duration_ns; + + // The average latency When 1.. are concurrently + // running an experiment. For example, if this reader_task is + // of index 5 in the reader_tasks array, then result is for + // 6 cores. 
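	// (That is, reader_tasks[i].result_avg holds the average measured
	// when i + 1 readers run the experiment concurrently.)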
+ u64 result_avg; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; +static int nreaders; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_perf_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + const char *name; +}; + +static struct ref_perf_ops *cur_ops; + +// Definitions for RCU ref perf testing. +static int ref_rcu_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void ref_rcu_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct ref_perf_ops rcu_ops = { + .init = rcu_sync_perf_init, + .readlock = ref_rcu_read_lock, + .readunlock = ref_rcu_read_unlock, + .name = "rcu" +}; + + +// Definitions for SRCU ref perf testing. +DEFINE_STATIC_SRCU(srcu_refctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; + +static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops srcu_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_read_lock, + .readunlock = srcu_ref_perf_read_unlock, + .name = "srcu" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static int srcu_ref_perf_refcnt_lock(void) +{ + atomic_inc(&refcnt); + return 0; +} + +static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) +{ + atomic_dec(&refcnt); + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops refcnt_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_refcnt_lock, + .readunlock = srcu_ref_perf_refcnt_unlock, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_perf_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static int ref_perf_rwlock_lock(void) +{ + read_lock(&test_rwlock); + return 0; +} + +static void ref_perf_rwlock_unlock(int idx) +{ + read_unlock(&test_rwlock); +} + +static struct ref_perf_ops rwlock_ops = { + .init = ref_perf_rwlock_init, + .readlock = ref_perf_rwlock_lock, + .readunlock = ref_perf_rwlock_unlock, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_perf_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static int ref_perf_rwsem_lock(void) +{ + down_read(&test_rwsem); + return 0; +} + +static void ref_perf_rwsem_unlock(int idx) +{ + up_read(&test_rwsem); +} + +static struct ref_perf_ops rwsem_ops = { + .init = ref_perf_rwsem_init, + .readlock = ref_perf_rwsem_lock, + .readunlock = ref_perf_rwsem_unlock, + .name = "rwsem" +}; + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. 
+static int +ref_perf_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + unsigned long spincnt; + int idx; + u64 start; + s64 duration; + + VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); +repeat: + VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + atomic_dec(&rt->start); + + // To prevent noise, keep interrupts disabled. This also has the + // effect of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + + for (spincnt = 0; spincnt < loops; spincnt++) { + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + } + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + + atomic_dec(&nreaders_exp); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!atomic_read(&nreaders_exp)) + wake_up(&main_wq); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_perf_reader"); + return 0; +} + +void reset_readers(int n) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < n; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char buf[512]; + u64 sum = 0; + + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i <= n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + PERFOUT("%s\n", buf); + + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + int exp, r; + char buf1[64]; + char buf[512]; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_PERFOUT("main_func task started"); + atomic_inc(&n_init); + + // Wait for all threads to start. 
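	// (n_init is incremented once by each of the nreaders reader
	// kthreads and once by main_func itself, hence nreaders + 1 below.)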
+ wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + + // Start exp readers up per experiment + for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + if (torture_must_stop()) + goto end; + + reset_readers(exp); + atomic_set(&nreaders_exp, exp + 1); + + exp_idx = exp; + + for (r = 0; r <= exp; r++) { + atomic_set(&reader_tasks[r].start, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", + exp); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_PERFOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + } + + // Print the average of all experiments + PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Threads\tTime(ns)\n"); + + for (exp = 0; exp < nreaders; exp++) { + sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + strcat(buf, buf1); + } + + PERFOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + return 0; +} + +static void +ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, loops); +} + +static void +ref_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_perf_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do perf-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_perf_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. + ref_perf_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_perf_init(void) +{ + long i; + int firsterr = 0; + static struct ref_perf_ops *perf_ops[] = { + &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(perf_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_perf_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (~75% of online CPUs). 
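	// (For example, with 24 CPUs online the computation below yields
	// (24 >> 1) + (24 >> 2) = 12 + 6 = 18 readers, matching the example
	// in the commit message above.)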
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_perf_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + + + // Wait until all threads start + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + wake_up(&main_wq); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_perf_cleanup(); + return firsterr; +} + +module_init(ref_perf_init); +module_exit(ref_perf_cleanup); -- cgit v1.2.3 From 708cda31652c02e64adaeafafe7b996e4e14c3eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 09:22:24 -0700 Subject: rcuperf: Add comments explaining the high reader overhead This commit adds comments explaining why the readers have otherwise insane levels of measurement overhead, namely that they are intended as a test load for update-side performance measurements, not as a straight-up read-side performance test. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 246da8fe199e..d906ca987936 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney "); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. */ static int rcu_perf_reader(void *arg) -- cgit v1.2.3 From 777a54c908ec69fa0eccab54068a49ecda38ffde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:16:44 -0700 Subject: refperf: Add holdoff parameter to allow CPUs to come online This commit adds an rcuperf module parameter named "holdoff" that defaults to 10 seconds if refperf is built in and to zero otherwise. The assumption is that all the CPUs are online by the time that the modprobe and insmod commands are going to do anything, and that normal systems will have all the CPUs online within ten seconds. Larger systems may take many tens of seconds or even minutes to get to this point, hence this being a module parameter instead of being a hard-coded constant. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 61161530acc8..4d686fdc3105 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -57,7 +57,10 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); -// Number of loops per experiment, all readers execute an operation concurrently +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); #ifdef MODULE @@ -248,6 +251,8 @@ ref_perf_reader(void *arg) set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); @@ -357,6 +362,8 @@ static int main_func(void *arg) // Wait for all threads to start. wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { @@ -420,8 +427,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, holdoff, loops); } static void -- cgit v1.2.3 From 75dd8efef56ed5959c398974c785026f84aa0d1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:59:06 -0700 Subject: refperf: Hoist function-pointer calls out of the loop Current runs show PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs consuming between 20 and 30 nanoseconds, when in fact the actual value is zero, give or take the barrier() asm's effect on compiler optimizations. The additional overhead is caused by function calls through pointers (especially in these days of Spectre mitigations) and perhaps also needless argument passing, a non-const loop limit, and an upcounting loop. This commit therefore combines the ->readlock() and ->readunlock() function pointers into a single ->readsection() function pointer that takes the loop count as a const parameter and keeps any data passed from the read-lock to the read-unlock internal to this new function. These changes reduce the measured overhead of the aforementioned PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs from between 20 and 30 nanoseconds to somewhere south of 500 picoseconds. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 92 ++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 4d686fdc3105..57c7b7a40bd2 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -108,23 +108,20 @@ static int exp_idx; struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); + void (*readsection)(const int nloops); const char *name; }; static struct ref_perf_ops *cur_ops; -// Definitions for RCU ref perf testing. -static int ref_rcu_read_lock(void) __acquires(RCU) +static void ref_rcu_read_section(const int nloops) { - rcu_read_lock(); - return 0; -} + int i; -static void ref_rcu_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } } static void rcu_sync_perf_init(void) @@ -133,8 +130,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, - .readlock = ref_rcu_read_lock, - .readunlock = ref_rcu_read_unlock, + .readsection = ref_rcu_read_section, .name = "rcu" }; @@ -143,42 +139,39 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +static void srcu_ref_perf_read_section(int nloops) { - return srcu_read_lock(srcu_ctlp); -} + int i; + int idx; -static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } } static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_read_lock, - .readunlock = srcu_ref_perf_read_unlock, + .readsection = srcu_ref_perf_read_section, .name = "srcu" }; // Definitions for reference count static atomic_t refcnt; -static int srcu_ref_perf_refcnt_lock(void) +static void ref_perf_refcnt_section(const int nloops) { - atomic_inc(&refcnt); - return 0; -} + int i; -static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) -{ - atomic_dec(&refcnt); - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } } static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_refcnt_lock, - .readunlock = srcu_ref_perf_refcnt_unlock, + .readsection = ref_perf_refcnt_section, .name = "refcnt" }; @@ -190,21 +183,19 @@ static void ref_perf_rwlock_init(void) rwlock_init(&test_rwlock); } -static int ref_perf_rwlock_lock(void) +static void ref_perf_rwlock_section(const int nloops) { - read_lock(&test_rwlock); - return 0; -} + int i; -static void ref_perf_rwlock_unlock(int idx) -{ - read_unlock(&test_rwlock); + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } } static struct ref_perf_ops rwlock_ops = { .init = ref_perf_rwlock_init, - .readlock = ref_perf_rwlock_lock, - .readunlock = ref_perf_rwlock_unlock, + .readsection = ref_perf_rwlock_section, .name = "rwlock" }; @@ -216,21 +207,19 @@ static void ref_perf_rwsem_init(void) init_rwsem(&test_rwsem); } -static int ref_perf_rwsem_lock(void) +static void ref_perf_rwsem_section(const int nloops) { - down_read(&test_rwsem); - return 0; -} + int i; -static void ref_perf_rwsem_unlock(int idx) -{ - up_read(&test_rwsem); + 
for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } } static struct ref_perf_ops rwsem_ops = { .init = ref_perf_rwsem_init, - .readlock = ref_perf_rwsem_lock, - .readunlock = ref_perf_rwsem_unlock, + .readsection = ref_perf_rwsem_section, .name = "rwsem" }; @@ -242,8 +231,6 @@ ref_perf_reader(void *arg) unsigned long flags; long me = (long)arg; struct reader_task *rt = &(reader_tasks[me]); - unsigned long spincnt; - int idx; u64 start; s64 duration; @@ -275,10 +262,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - for (spincnt = 0; spincnt < loops; spincnt++) { - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - } + cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); -- cgit v1.2.3 From 83b88c86da0e5f97faeac5a9bb19fe32f8c0394b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 15:31:07 -0700 Subject: refperf: Allow decimal nanoseconds The CONFIG_PREEMPT=n rcu_read_lock()/rcu_read_unlock() pair's overhead, even including loop overhead, is far less than one nanosecond. Since logscale plots are not all that happy with zero values, provide picoseconds as decimals. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57c7b7a40bd2..e991d4820f51 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -375,7 +375,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); } // Print the average of all experiments @@ -386,7 +386,7 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); strcat(buf, buf1); } -- cgit v1.2.3 From 8fc28783a0c3704ea27505a25dbde8333d75380c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 15:48:38 -0700 Subject: refperf: Convert nreaders to a module parameter This commit converts nreaders to a module parameter, with the default of -1 specifying the old behavior of using 75% of the readers. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index e991d4820f51..020e55a9a64b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -62,6 +62,12 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. 
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -93,7 +99,6 @@ static wait_queue_head_t main_wq; static int shutdown_start; static struct reader_task *reader_tasks; -static int nreaders; // Number of readers that are part of the current experiment. static atomic_t nreaders_exp; @@ -411,8 +416,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, holdoff, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders); } static void @@ -501,8 +506,9 @@ ref_perf_init(void) schedule_timeout_uninterruptible(1); } - // Reader tasks (~75% of online CPUs). - nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { -- cgit v1.2.3 From dbf28efdae7bb51032eeb0fe1b6bd07d6f0f9b6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:22:24 -0700 Subject: refperf: Provide module parameter to specify number of experiments The current code uses the number of threads both to limit the number of threads and to specify the number of experiments, but also varies the number of threads as the experiments progress. This commit takes a different approach by adding an refperf.nruns module parameter that specifies the number of experiments, and furthermore uses the same number of threads for each experiment. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 020e55a9a64b..6324449db404 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -83,12 +83,6 @@ struct reader_task { atomic_t start; wait_queue_head_t wq; u64 last_duration_ns; - - // The average latency When 1.. are concurrently - // running an experiment. For example, if this reader_task is - // of index 5 in the reader_tasks array, then result is for - // 6 cores. - u64 result_avg; }; static struct task_struct *shutdown_task; @@ -289,12 +283,12 @@ end: return 0; } -void reset_readers(int n) +void reset_readers(void) { int i; struct reader_task *rt; - for (i = 0; i < n; i++) { + for (i = 0; i < nreaders; i++) { rt = &(reader_tasks[i]); rt->last_duration_ns = 0; @@ -314,7 +308,7 @@ u64 process_durations(int n) sprintf(buf, "Experiment #%d (Format: :)", exp_idx); - for (i = 0; i <= n && !torture_must_stop(); i++) { + for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); @@ -342,11 +336,15 @@ static int main_func(void *arg) int exp, r; char buf1[64]; char buf[512]; + u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); set_user_nice(current, MAX_NICE); VERBOSE_PERFOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + if (!result_avg) + VERBOSE_PERFOUT_ERRSTRING("out of memory"); atomic_inc(&n_init); // Wait for all threads to start. 
@@ -355,22 +353,24 @@ static int main_func(void *arg) schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment - for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (!result_avg) + break; if (torture_must_stop()) goto end; - reset_readers(exp); - atomic_set(&nreaders_exp, exp + 1); + reset_readers(); + atomic_set(&nreaders_exp, nreaders); exp_idx = exp; - for (r = 0; r <= exp; r++) { + for (r = 0; r < nreaders; r++) { atomic_set(&reader_tasks[r].start, 1); wake_up(&reader_tasks[r].wq); } VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", - exp); + nreaders); wait_event(main_wq, !atomic_read(&nreaders_exp) || torture_must_stop()); @@ -380,7 +380,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); + result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); } // Print the average of all experiments @@ -390,12 +390,15 @@ static int main_func(void *arg) strcat(buf, "\n"); strcat(buf, "Threads\tTime(ns)\n"); - for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); + for (exp = 0; exp < nruns; exp++) { + if (!result_avg) + break; + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - PERFOUT("%s", buf); + if (result_avg) + PERFOUT("%s", buf); // This will shutdown everything including us. if (shutdown) { @@ -416,8 +419,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns); } static void -- cgit v1.2.3 From f518f154ecef347777db33b7c9b0581f245159f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:32:56 -0700 Subject: refperf: Dynamically allocate experiment-summary output buffer Currently, the buffer used to accumulate the experiment-summary output is fixed size, which will cause problems if someone decides to run one hundred experiments. This commit therefore dynamically allocates this buffer. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 6324449db404..75b9cceaece1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -333,9 +333,10 @@ u64 process_durations(int n) // point all the timestamps are printed. static int main_func(void *arg) { + bool errexit = false; int exp, r; char buf1[64]; - char buf[512]; + char *buf; u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); @@ -343,8 +344,11 @@ static int main_func(void *arg) VERBOSE_PERFOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - if (!result_avg) + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { VERBOSE_PERFOUT_ERRSTRING("out of memory"); + errexit = true; + } atomic_inc(&n_init); // Wait for all threads to start. 
@@ -354,7 +358,7 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (!result_avg) + if (errexit) break; if (torture_must_stop()) goto end; @@ -391,13 +395,13 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { - if (!result_avg) + if (errexit) break; sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - if (result_avg) + if (!errexit) PERFOUT("%s", buf); // This will shutdown everything including us. @@ -412,6 +416,8 @@ static int main_func(void *arg) end: torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); return 0; } -- cgit v1.2.3 From 2e90de76f226f11fe26c871aa321be28152f565a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:45:03 -0700 Subject: refperf: Dynamically allocate thread-summary output buffer Currently, the buffer used to accumulate the thread-summary output is fixed size, which will cause problems if someone decides to run on a large number of CPUs. This commit therefore dynamically allocates this buffer. [ paulmck: Fix memory allocation as suggested by KASAN. ] Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 75b9cceaece1..fc940e3dba1f 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -301,9 +301,12 @@ u64 process_durations(int n) int i; struct reader_task *rt; char buf1[64]; - char buf[512]; + char *buf; u64 sum = 0; + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; buf[0] = 0; sprintf(buf, "Experiment #%d (Format: :)", exp_idx); @@ -322,6 +325,7 @@ u64 process_durations(int n) PERFOUT("%s\n", buf); + kfree(buf); return sum; } -- cgit v1.2.3 From 2990750bceb05c3cdeae3a6d2683cbc4ae4de15e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 09:32:57 -0700 Subject: refperf: Make functions static Because the reset_readers() and process_durations() functions are used only within kernel/rcu/refperf.c, this commit makes them static. Reported-by: kbuild test robot Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index fc940e3dba1f..0a900f3ae151 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -283,7 +283,7 @@ end: return 0; } -void reset_readers(void) +static void reset_readers(void) { int i; struct reader_task *rt; @@ -296,7 +296,7 @@ void reset_readers(void) } // Print the results of each reader and return the sum of all their durations. -u64 process_durations(int n) +static u64 process_durations(int n) { int i; struct reader_task *rt; -- cgit v1.2.3 From b864f89ff61492f56b4e8c6713a5efec6540a0e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 10:57:34 -0700 Subject: refperf: Tune reader measurement interval This commit moves a printk() out of the measurement interval, converts an atomic_dec()/atomic_read() pair to atomic_dec_and_test(), and adds a smp_mb__before_atomic() to avoid potential wake/wait hangs. These changes have the added benefit of reducing the number of loops required for amortizing loop overhead for CONFIG_PREEMPT=n RCU measurements from 1,000,000 to 10,000.
This reduction in turn shortens the test, reducing the probability of interference. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 0a900f3ae151..8815ccfb6f98 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -252,15 +252,16 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. WARN_ON_ONCE(smp_processor_id() != me); + smp_mb__before_atomic(); atomic_dec(&rt->start); + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + // To prevent noise, keep interrupts disabled. This also has the // effect of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; @@ -268,14 +269,12 @@ repeat: rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; - atomic_dec(&nreaders_exp); + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", me, exp_idx, atomic_read(&nreaders_exp)); - if (!atomic_read(&nreaders_exp)) - wake_up(&main_wq); - if (!torture_must_stop()) goto repeat; end: -- cgit v1.2.3 From af2789db13b8dc38d16e969f8c11b9468be42d46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 11:22:03 -0700 Subject: refperf: Convert reader_task structure's "start" field to int This commit converts the reader_task structure's "start" field to int in order to demote a full barrier to an smp_load_acquire() and also to simplify the code a bit. While in the area, and to enlist the compiler's help in ensuring that nothing was missed, the field's name was changed to start_reader. Also while in the area, change the main_func() store to use smp_store_release() to further fortify against wait/wake races. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 8815ccfb6f98..2fd3ed1a0d0d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -80,7 +80,7 @@ torture_param(bool, shutdown, REFPERF_SHUTDOWN, struct reader_task { struct task_struct *task; - atomic_t start; + int start_reader; wait_queue_head_t wq; u64 last_duration_ns; }; @@ -243,7 +243,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. - wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || torture_must_stop()); if (torture_must_stop()) @@ -252,8 +252,7 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. 
WARN_ON_ONCE(smp_processor_id() != me); - smp_mb__before_atomic(); - atomic_dec(&rt->start); + WRITE_ONCE(rt->start_reader, 0); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -372,7 +371,7 @@ static int main_func(void *arg) exp_idx = exp; for (r = 0; r < nreaders; r++) { - atomic_set(&reader_tasks[r].start, 1); + smp_store_release(&reader_tasks[r].start_reader, 1); wake_up(&reader_tasks[r].wq); } -- cgit v1.2.3 From 86e0da2bb8ed934d3dce5a337895f1118f59c087 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 11:40:52 -0700 Subject: refperf: More closely synchronize reader start times Currently, readers are awakened individually. On most systems, this results in significant wakeup delay from one reader to the next, which can result in the first and last reader having sole access to the synchronization primitive in question. If that synchronization primitive involves shared memory, those readers will rack up a huge number of operations in a very short time, causing large perturbations in the results. This commit therefore has the readers busy-wait after being awakened, and uses a new n_started variable to synchronize their start times. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2fd3ed1a0d0d..234bb0e84a8b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -99,6 +99,7 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; +static atomic_t n_started; // Track which experiment is currently running. static int exp_idx; @@ -253,6 +254,9 @@ repeat: WARN_ON_ONCE(smp_processor_id() != me); WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -367,6 +371,7 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); exp_idx = exp; -- cgit v1.2.3 From 2db0bda38453f472640f4ece1e2a495cbd44f892 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 12:34:57 -0700 Subject: refperf: Add warmup and cooldown processing phases This commit causes all the readers to start running unmeasured load until all readers have done at least one such run (thus having warmed up), then run the measured load, and then run unmeasured load until all readers have completed their measured load. This approach avoids any thread running measured load while other readers are idle. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 234bb0e84a8b..445190b97b05 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -100,6 +100,8 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; // Track which experiment is currently running. static int exp_idx; @@ -260,8 +262,15 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - // To prevent noise, keep interrupts disabled. This also has the - // effect of preventing entries into slow path for rcu_read_unlock(). 
+ + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + cur_ops->readsection(loops); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + cur_ops->readsection(loops); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); @@ -271,6 +280,11 @@ repeat: local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + cur_ops->readsection(loops); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -372,6 +386,8 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); exp_idx = exp; -- cgit v1.2.3 From 6efb06340846c788336f402e3a472a24fabb431e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 14:26:25 -0700 Subject: refperf: Label experiment-number column "Runs" The experiment-number column is currently labeled "Threads", which is misleading at best. This commit therefore relabels it as "Runs", and adjusts the scripts accordingly. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 445190b97b05..2d2d227d761a 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -415,7 +415,7 @@ static int main_func(void *arg) buf[0] = 0; strcat(buf, "\n"); - strcat(buf, "Threads\tTime(ns)\n"); + strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { if (errexit) -- cgit v1.2.3 From 96af8669591d740a1e2695c4d96e544409dbf896 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 May 2020 16:46:56 -0700 Subject: refperf: Simplify initialization-time wakeup protocol This commit moves the reader-launch wait loop from ref_perf_init() to main_func(), removing one layer of wakeup and allowing slightly faster system boot. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2d2d227d761a..7839237ffc17 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -369,13 +369,14 @@ static int main_func(void *arg) VERBOSE_PERFOUT_ERRSTRING("out of memory"); errexit = true; } - atomic_inc(&n_init); - - // Wait for all threads to start. - wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); + // Wait for all threads to start. 
+ atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (errexit) @@ -565,14 +566,6 @@ ref_perf_init(void) firsterr = torture_create_kthread(main_func, NULL, main_task); if (firsterr) goto unwind; - schedule_timeout_uninterruptible(1); - - - // Wait until all threads start - while (atomic_read(&n_init) < nreaders + 1) - schedule_timeout_uninterruptible(1); - - wake_up(&main_wq); torture_init_end(); return 0; -- cgit v1.2.3 From b4d1e34f6502a138e32275baabdb6d593d7ea432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 16:37:35 -0700 Subject: refperf: Add read-side delay module parameter This commit adds a refperf.readdelay module parameter that controls the duration of each critical section. This parameter allows gathering data showing how the performance differences between the various primitives vary with critical-section length. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 7839237ffc17..57a750bbcaca 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in nanoseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); +// Reader delay in microseconds, 0 for no delay. 
+torture_param(int, readdelay, 0, "Read-side delay in microseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,6 +111,7 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int ndelay); const char *name; }; @@ -126,6 +127,17 @@ static void ref_rcu_read_section(const int nloops) } } +static void ref_rcu_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + udelay(ndelay); + rcu_read_unlock(); + } +} + static void rcu_sync_perf_init(void) { } @@ -133,6 +145,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, .name = "rcu" }; @@ -141,7 +154,7 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static void srcu_ref_perf_read_section(int nloops) +static void srcu_ref_perf_read_section(const int nloops) { int i; int idx; @@ -152,16 +165,29 @@ static void srcu_ref_perf_read_section(int nloops) } } +static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + udelay(ndelay); + srcu_read_unlock(srcu_ctlp, idx); + } +} + static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readsection = srcu_ref_perf_read_section, + .delaysection = srcu_ref_perf_delay_section, .name = "srcu" }; // Definitions for reference count static atomic_t refcnt; -static void ref_perf_refcnt_section(const int nloops) +static void ref_refcnt_section(const int nloops) { int i; @@ -171,45 +197,69 @@ static void ref_perf_refcnt_section(const int nloops) } } +static void ref_refcnt_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + udelay(ndelay); + atomic_dec(&refcnt); + } +} + static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readsection = ref_perf_refcnt_section, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, .name = "refcnt" }; // Definitions for rwlock static rwlock_t test_rwlock; -static void ref_perf_rwlock_init(void) +static void ref_rwlock_init(void) { rwlock_init(&test_rwlock); } -static void ref_perf_rwlock_section(const int nloops) +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int ndelay) { int i; for (i = nloops; i >= 0; i--) { read_lock(&test_rwlock); + udelay(ndelay); read_unlock(&test_rwlock); } } static struct ref_perf_ops rwlock_ops = { - .init = ref_perf_rwlock_init, - .readsection = ref_perf_rwlock_section, + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, .name = "rwlock" }; // Definitions for rwsem static struct rw_semaphore test_rwsem; -static void ref_perf_rwsem_init(void) +static void ref_rwsem_init(void) { init_rwsem(&test_rwsem); } -static void ref_perf_rwsem_section(const int nloops) +static void ref_rwsem_section(const int nloops) { int i; @@ -219,12 +269,32 @@ static void ref_perf_rwsem_section(const int nloops) } } +static void ref_rwsem_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; 
i--) { + down_read(&test_rwsem); + udelay(ndelay); + up_read(&test_rwsem); + } +} + static struct ref_perf_ops rwsem_ops = { - .init = ref_perf_rwsem_init, - .readsection = ref_perf_rwsem_section, + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, .name = "rwsem" }; +static void rcu_perf_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -265,16 +335,16 @@ repeat: // To reduce noise, do an initial cache-warming invocation, check // in, and then keep warming until everyone has checked in. - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); // Also keep interrupts disabled. This also has the effect // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - cur_ops->readsection(loops); + rcu_perf_one_reader(); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); @@ -284,7 +354,7 @@ repeat: // everyone is done. if (!atomic_dec_return(&n_cooleddown)) while (atomic_read_acquire(&n_cooleddown)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -449,8 +519,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); } static void -- cgit v1.2.3 From 4dd72a338a07486823037a6b45334d05192c913a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 May 2020 13:11:26 -0700 Subject: refperf: Adjust refperf.loop default value With the various measurement optimizations, 10,000 loops normally suffices. This commit therefore reduces the refperf.loops default value from 10,000,000 to 10,000. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57a750bbcaca..063eeb0473a1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -61,7 +61,7 @@ torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000000, "Number of loops per experiment."); +torture_param(long, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. 
-- cgit v1.2.3 From 7c944d7c67daee84e3c756bb74ad2f32b28c41cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 14:36:26 -0700 Subject: refperf: Work around 64-bit division MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 64-bit division was introduced in refperf, breaking compilation on all 32-bit architectures: kernel/rcu/refperf.o: in function `main_func': refperf.c:(.text+0x57c): undefined reference to `__aeabi_uldivmod' Fix this by using div_u64 to mark the expensive operation. [ paulmck: Update primitive and format per Nathan Chancellor. ] Fixes: bd5b16d6c88d ("refperf: Allow decimal nanoseconds") Reported-by: kbuild test robot Reported-by: Valdis Klētnieks Acked-by: Randy Dunlap # build-tested Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 063eeb0473a1..80d449060bdf 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -478,7 +478,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } // Print the average of all experiments @@ -489,9 +489,13 @@ static int main_func(void *arg) strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + if (errexit) break; - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); } -- cgit v1.2.3 From 918b351d965560c7902ad482cf87049517843ff2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 May 2020 18:14:57 -0700 Subject: refperf: Change readdelay module parameter to nanoseconds The current units of microseconds are too coarse, so this commit changes the units to nanoseconds. However, ndelay is used only for the nanoseconds with udelay being used for whole microseconds. For example, setting refperf.readdelay=1500 results in a udelay(1) followed by an ndelay(500). Suggested-by: Akira Yokosawa [ paulmck: Abstracted delay per Akira feedback and move from 80 to 100 lines. ] [ paulmck: Fix names as suggested by kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 80d449060bdf..49fffb9bce77 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in microseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in microseconds."); +// Reader delay in nanoseconds, 0 for no delay. 
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,12 +111,20 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); - void (*delaysection)(const int nloops, const int ndelay); + void (*delaysection)(const int nloops, const int udl, const int ndl); const char *name; }; static struct ref_perf_ops *cur_ops; +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + static void ref_rcu_read_section(const int nloops) { int i; @@ -127,13 +135,13 @@ static void ref_rcu_read_section(const int nloops) } } -static void ref_rcu_delay_section(const int nloops, const int ndelay) +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { rcu_read_lock(); - udelay(ndelay); + un_delay(udl, ndl); rcu_read_unlock(); } } @@ -165,14 +173,14 @@ static void srcu_ref_perf_read_section(const int nloops) } } -static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) { int i; int idx; for (i = nloops; i >= 0; i--) { idx = srcu_read_lock(srcu_ctlp); - udelay(ndelay); + un_delay(udl, ndl); srcu_read_unlock(srcu_ctlp, idx); } } @@ -197,13 +205,13 @@ static void ref_refcnt_section(const int nloops) } } -static void ref_refcnt_delay_section(const int nloops, const int ndelay) +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { atomic_inc(&refcnt); - udelay(ndelay); + un_delay(udl, ndl); atomic_dec(&refcnt); } } @@ -233,13 +241,13 @@ static void ref_rwlock_section(const int nloops) } } -static void ref_rwlock_delay_section(const int nloops, const int ndelay) +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { read_lock(&test_rwlock); - udelay(ndelay); + un_delay(udl, ndl); read_unlock(&test_rwlock); } } @@ -269,13 +277,13 @@ static void ref_rwsem_section(const int nloops) } } -static void ref_rwsem_delay_section(const int nloops, const int ndelay) +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { down_read(&test_rwsem); - udelay(ndelay); + un_delay(udl, ndl); up_read(&test_rwsem); } } @@ -292,7 +300,7 @@ static void rcu_perf_one_reader(void) if (readdelay <= 0) cur_ops->readsection(loops); else - cur_ops->delaysection(loops, readdelay); + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } // Reader kthread. Repeatedly does empty RCU read-side -- cgit v1.2.3 From 72bb749e7048d0a8d7663b59ec1a33bd56c51083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jun 2020 08:34:41 -0700 Subject: refperf: Add test for RCU Tasks Trace readers. This commit adds testing for RCU Tasks Trace readers to the refperf module. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 49fffb9bce77..da7de9ac548d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -157,7 +158,6 @@ static struct ref_perf_ops rcu_ops = { .name = "rcu" }; - // Definitions for SRCU ref perf testing. 
DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; @@ -192,6 +192,35 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks Trace ref perf testing. +static void rcu_trace_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_perf_ops rcu_trace_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_trace_ref_perf_read_section, + .delaysection = rcu_trace_ref_perf_delay_section, + .name = "rcu-trace" +}; + // Definitions for reference count static atomic_t refcnt; @@ -584,7 +613,7 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) -- cgit v1.2.3 From e13ef442fe522fa1f604efec8c899a0e1fc3d426 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jun 2020 11:56:34 -0700 Subject: refperf: Add test for RCU Tasks readers This commit adds testing for RCU Tasks readers to the refperf module. This also applies to RCU Rude readers, as both flavors have empty (as in non-existent) read-side markers. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index da7de9ac548d..2bfdcdcb6bd1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -192,6 +192,31 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks ref perf testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_perf_ops rcu_tasks_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_tasks_ref_perf_read_section, + .delaysection = rcu_tasks_ref_perf_delay_section, + .name = "rcu-tasks" +}; + // Definitions for RCU Tasks Trace ref perf testing. static void rcu_trace_ref_perf_read_section(const int nloops) { @@ -613,7 +638,8 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) -- cgit v1.2.3 From c7dcf8106f7570b133b05ff68fd4100064965d9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Jun 2020 13:11:29 -0700 Subject: rcu-tasks: Fix synchronize_rcu_tasks_trace() header comment The synchronize_rcu_tasks_trace() header comment incorrectly claims that any number of things delimit RCU Tasks Trace read-side critical sections, when in fact only rcu_read_lock_trace() and rcu_read_unlock_trace() do so. 
This commit therefore fixes this comment, and, while in the area, fixes a typo in the rcu_read_lock_trace() header comment. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..a77298c1d126 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1118,11 +1118,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles -- cgit v1.2.3 From 8e4ec3d02b549a731c94b4bcddff212bb92cdbaf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:33:54 -0700 Subject: refperf: Rename RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST The old Kconfig option name is all too easy to conflate with the unrelated "perf" feature, so this commit renames RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 4 ++-- kernel/rcu/Makefile | 2 +- kernel/rcu/refperf.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 858765b7f644..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,8 +61,8 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_REF_PERF_TEST - tristate "Performance tests for read-side synchronization (RCU and others)" +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL select TORTURE_TEST select SRCU diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index ba7d82609cbe..45d562de279a 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2bfdcdcb6bd1..7c980573acbe 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ // -// Performance test comparing RCU vs other mechanisms +// Scalability test comparing RCU vs other mechanisms // for acquiring references on objects. // // Copyright (C) Google, 2020. 
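/*
 * An illustrative configuration sketch (not part of this patch): with the
 * option renamed as above, building the test as a module takes a .config
 * fragment along these lines, with TORTURE_TEST and SRCU being selected
 * automatically by the Kconfig "select" statements shown earlier:
 *
 *	CONFIG_DEBUG_KERNEL=y
 *	CONFIG_RCU_REF_SCALE_TEST=m
 */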
@@ -59,7 +59,7 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. -torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000, "Number of loops per experiment."); @@ -656,7 +656,7 @@ ref_perf_init(void) for (i = 0; i < ARRAY_SIZE(perf_ops); i++) pr_cont(" %s", perf_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; -- cgit v1.2.3 From 1fbeb3a8c4de29433a8d230ee600b13d369b6c0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:53:53 -0700 Subject: refperf: Rename refperf.c to refscale.c and change internal names This commit further avoids conflation of refperf with the kernel's perf feature by renaming kernel/rcu/refperf.c to kernel/rcu/refscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. The rcutorture --torture type remains refperf, and this will be addressed in a separate commit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 2 +- kernel/rcu/refperf.c | 717 -------------------------------------------------- kernel/rcu/refscale.c | 717 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 718 insertions(+), 718 deletions(-) delete mode 100644 kernel/rcu/refperf.c create mode 100644 kernel/rcu/refscale.c (limited to 'kernel') diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 45d562de279a..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c deleted file mode 100644 index 7c980573acbe..000000000000 --- a/kernel/rcu/refperf.c +++ /dev/null @@ -1,717 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -// -// Scalability test comparing RCU vs other mechanisms -// for acquiring references on objects. -// -// Copyright (C) Google, 2020. -// -// Author: Joel Fernandes - -#define pr_fmt(fmt) fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -#define PERF_FLAG "-ref-perf: " - -#define PERFOUT(s, x...) \ - pr_alert("%s" PERF_FLAG s, perf_type, ## x) - -#define VERBOSE_PERFOUT(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) - -#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
" s, perf_type, ## x); } while (0) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Joel Fernandes (Google) "); - -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); - -torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); - -// Wait until there are multiple CPUs before starting test. -torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, - "Holdoff time before test start (s)"); -// Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); -// Number of readers, with -1 defaulting to about 75% of the CPUs. -torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); -// Number of runs. -torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in nanoseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); - -#ifdef MODULE -# define REFPERF_SHUTDOWN 0 -#else -# define REFPERF_SHUTDOWN 1 -#endif - -torture_param(bool, shutdown, REFPERF_SHUTDOWN, - "Shutdown at end of performance tests."); - -struct reader_task { - struct task_struct *task; - int start_reader; - wait_queue_head_t wq; - u64 last_duration_ns; -}; - -static struct task_struct *shutdown_task; -static wait_queue_head_t shutdown_wq; - -static struct task_struct *main_task; -static wait_queue_head_t main_wq; -static int shutdown_start; - -static struct reader_task *reader_tasks; - -// Number of readers that are part of the current experiment. -static atomic_t nreaders_exp; - -// Use to wait for all threads to start. -static atomic_t n_init; -static atomic_t n_started; -static atomic_t n_warmedup; -static atomic_t n_cooleddown; - -// Track which experiment is currently running. -static int exp_idx; - -// Operations vector for selecting different types of tests. -struct ref_perf_ops { - void (*init)(void); - void (*cleanup)(void); - void (*readsection)(const int nloops); - void (*delaysection)(const int nloops, const int udl, const int ndl); - const char *name; -}; - -static struct ref_perf_ops *cur_ops; - -static void un_delay(const int udl, const int ndl) -{ - if (udl) - udelay(udl); - if (ndl) - ndelay(ndl); -} - -static void ref_rcu_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock(); - rcu_read_unlock(); - } -} - -static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock(); - un_delay(udl, ndl); - rcu_read_unlock(); - } -} - -static void rcu_sync_perf_init(void) -{ -} - -static struct ref_perf_ops rcu_ops = { - .init = rcu_sync_perf_init, - .readsection = ref_rcu_read_section, - .delaysection = ref_rcu_delay_section, - .name = "rcu" -}; - -// Definitions for SRCU ref perf testing. 
-DEFINE_STATIC_SRCU(srcu_refctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; - -static void srcu_ref_perf_read_section(const int nloops) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock(srcu_ctlp); - srcu_read_unlock(srcu_ctlp, idx); - } -} - -static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock(srcu_ctlp); - un_delay(udl, ndl); - srcu_read_unlock(srcu_ctlp, idx); - } -} - -static struct ref_perf_ops srcu_ops = { - .init = rcu_sync_perf_init, - .readsection = srcu_ref_perf_read_section, - .delaysection = srcu_ref_perf_delay_section, - .name = "srcu" -}; - -// Definitions for RCU Tasks ref perf testing: Empty read markers. -// These definitions also work for RCU Rude readers. -static void rcu_tasks_ref_perf_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) - continue; -} - -static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) - un_delay(udl, ndl); -} - -static struct ref_perf_ops rcu_tasks_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_tasks_ref_perf_read_section, - .delaysection = rcu_tasks_ref_perf_delay_section, - .name = "rcu-tasks" -}; - -// Definitions for RCU Tasks Trace ref perf testing. -static void rcu_trace_ref_perf_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock_trace(); - rcu_read_unlock_trace(); - } -} - -static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock_trace(); - un_delay(udl, ndl); - rcu_read_unlock_trace(); - } -} - -static struct ref_perf_ops rcu_trace_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_trace_ref_perf_read_section, - .delaysection = rcu_trace_ref_perf_delay_section, - .name = "rcu-trace" -}; - -// Definitions for reference count -static atomic_t refcnt; - -static void ref_refcnt_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - atomic_inc(&refcnt); - atomic_dec(&refcnt); - } -} - -static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - atomic_inc(&refcnt); - un_delay(udl, ndl); - atomic_dec(&refcnt); - } -} - -static struct ref_perf_ops refcnt_ops = { - .init = rcu_sync_perf_init, - .readsection = ref_refcnt_section, - .delaysection = ref_refcnt_delay_section, - .name = "refcnt" -}; - -// Definitions for rwlock -static rwlock_t test_rwlock; - -static void ref_rwlock_init(void) -{ - rwlock_init(&test_rwlock); -} - -static void ref_rwlock_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - read_lock(&test_rwlock); - read_unlock(&test_rwlock); - } -} - -static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - read_lock(&test_rwlock); - un_delay(udl, ndl); - read_unlock(&test_rwlock); - } -} - -static struct ref_perf_ops rwlock_ops = { - .init = ref_rwlock_init, - .readsection = ref_rwlock_section, - .delaysection = ref_rwlock_delay_section, - .name = "rwlock" -}; - -// Definitions for rwsem -static struct rw_semaphore test_rwsem; - -static void ref_rwsem_init(void) -{ - init_rwsem(&test_rwsem); -} - -static void ref_rwsem_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - 
down_read(&test_rwsem); - up_read(&test_rwsem); - } -} - -static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - down_read(&test_rwsem); - un_delay(udl, ndl); - up_read(&test_rwsem); - } -} - -static struct ref_perf_ops rwsem_ops = { - .init = ref_rwsem_init, - .readsection = ref_rwsem_section, - .delaysection = ref_rwsem_delay_section, - .name = "rwsem" -}; - -static void rcu_perf_one_reader(void) -{ - if (readdelay <= 0) - cur_ops->readsection(loops); - else - cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); -} - -// Reader kthread. Repeatedly does empty RCU read-side -// critical section, minimizing update-side interference. -static int -ref_perf_reader(void *arg) -{ - unsigned long flags; - long me = (long)arg; - struct reader_task *rt = &(reader_tasks[me]); - u64 start; - s64 duration; - - VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - atomic_inc(&n_init); - if (holdoff) - schedule_timeout_interruptible(holdoff * HZ); -repeat: - VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); - - // Wait for signal that this reader can start. - wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || - torture_must_stop()); - - if (torture_must_stop()) - goto end; - - // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != me); - - WRITE_ONCE(rt->start_reader, 0); - if (!atomic_dec_return(&n_started)) - while (atomic_read_acquire(&n_started)) - cpu_relax(); - - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - - - // To reduce noise, do an initial cache-warming invocation, check - // in, and then keep warming until everyone has checked in. - rcu_perf_one_reader(); - if (!atomic_dec_return(&n_warmedup)) - while (atomic_read_acquire(&n_warmedup)) - rcu_perf_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); - start = ktime_get_mono_fast_ns(); - - rcu_perf_one_reader(); - - duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); - - rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; - // To reduce runtime-skew noise, do maintain-load invocations until - // everyone is done. - if (!atomic_dec_return(&n_cooleddown)) - while (atomic_read_acquire(&n_cooleddown)) - rcu_perf_one_reader(); - - if (atomic_dec_and_test(&nreaders_exp)) - wake_up(&main_wq); - - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", - me, exp_idx, atomic_read(&nreaders_exp)); - - if (!torture_must_stop()) - goto repeat; -end: - torture_kthread_stopping("ref_perf_reader"); - return 0; -} - -static void reset_readers(void) -{ - int i; - struct reader_task *rt; - - for (i = 0; i < nreaders; i++) { - rt = &(reader_tasks[i]); - - rt->last_duration_ns = 0; - } -} - -// Print the results of each reader and return the sum of all their durations. 
-static u64 process_durations(int n) -{ - int i; - struct reader_task *rt; - char buf1[64]; - char *buf; - u64 sum = 0; - - buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); - if (!buf) - return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: :)", - exp_idx); - - for (i = 0; i < n && !torture_must_stop(); i++) { - rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); - - if (i % 5 == 0) - strcat(buf, "\n"); - strcat(buf, buf1); - - sum += rt->last_duration_ns; - } - strcat(buf, "\n"); - - PERFOUT("%s\n", buf); - - kfree(buf); - return sum; -} - -// The main_func is the main orchestrator, it performs a bunch of -// experiments. For every experiment, it orders all the readers -// involved to start and waits for them to finish the experiment. It -// then reads their timestamps and starts the next experiment. Each -// experiment progresses from 1 concurrent reader to N of them at which -// point all the timestamps are printed. -static int main_func(void *arg) -{ - bool errexit = false; - int exp, r; - char buf1[64]; - char *buf; - u64 *result_avg; - - set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - - VERBOSE_PERFOUT("main_func task started"); - result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - buf = kzalloc(64 + nruns * 32, GFP_KERNEL); - if (!result_avg || !buf) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - errexit = true; - } - if (holdoff) - schedule_timeout_interruptible(holdoff * HZ); - - // Wait for all threads to start. - atomic_inc(&n_init); - while (atomic_read(&n_init) < nreaders + 1) - schedule_timeout_uninterruptible(1); - - // Start exp readers up per experiment - for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (errexit) - break; - if (torture_must_stop()) - goto end; - - reset_readers(); - atomic_set(&nreaders_exp, nreaders); - atomic_set(&n_started, nreaders); - atomic_set(&n_warmedup, nreaders); - atomic_set(&n_cooleddown, nreaders); - - exp_idx = exp; - - for (r = 0; r < nreaders; r++) { - smp_store_release(&reader_tasks[r].start_reader, 1); - wake_up(&reader_tasks[r].wq); - } - - VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", - nreaders); - - wait_event(main_wq, - !atomic_read(&nreaders_exp) || torture_must_stop()); - - VERBOSE_PERFOUT("main_func: experiment ended"); - - if (torture_must_stop()) - goto end; - - result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); - } - - // Print the average of all experiments - PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - - for (exp = 0; exp < nruns; exp++) { - u64 avg; - u32 rem; - - if (errexit) - break; - avg = div_u64_rem(result_avg[exp], 1000, &rem); - sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); - strcat(buf, buf1); - } - - if (!errexit) - PERFOUT("%s", buf); - - // This will shutdown everything including us. 
- if (shutdown) { - shutdown_start = 1; - wake_up(&shutdown_wq); - } - - // Wait for torture to stop us - while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); - -end: - torture_kthread_stopping("main_func"); - kfree(result_avg); - kfree(buf); - return 0; -} - -static void -ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) -{ - pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); -} - -static void -ref_perf_cleanup(void) -{ - int i; - - if (torture_cleanup_begin()) - return; - - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nreaders; i++) - torture_stop_kthread("ref_perf_reader", - reader_tasks[i].task); - } - kfree(reader_tasks); - - torture_stop_kthread("main_task", main_task); - kfree(main_task); - - // Do perf-type-specific cleanup operations. - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - -// Shutdown kthread. Just waits to be awakened, then shuts down system. -static int -ref_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, shutdown_start); - - smp_mb(); // Wake before output. - ref_perf_cleanup(); - kernel_power_off(); - - return -EINVAL; -} - -static int __init -ref_perf_init(void) -{ - long i; - int firsterr = 0; - static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, - &refcnt_ops, &rwlock_ops, &rwsem_ops, - }; - - if (!torture_init_begin(perf_type, verbose)) - return -EBUSY; - - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); - pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); - firsterr = -EINVAL; - cur_ops = NULL; - goto unwind; - } - if (cur_ops->init) - cur_ops->init(); - - ref_perf_print_module_parms(cur_ops, "Start of test"); - - // Shutdown task - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(ref_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - - // Reader tasks (default to ~75% of online CPUs). 
- if (nreaders < 0) - nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), - GFP_KERNEL); - if (!reader_tasks) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - - VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); - - for (i = 0; i < nreaders; i++) { - firsterr = torture_create_kthread(ref_perf_reader, (void *)i, - reader_tasks[i].task); - if (firsterr) - goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); - } - - // Main Task - init_waitqueue_head(&main_wq); - firsterr = torture_create_kthread(main_func, NULL, main_task); - if (firsterr) - goto unwind; - - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - ref_perf_cleanup(); - return firsterr; -} - -module_init(ref_perf_init); -module_exit(ref_perf_cleanup); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..d9291f883b54 --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(long, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. 
+static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_scale_ops { + void (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + const char *name; +}; + +static struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +// Definitions for RCU Tasks Trace ref scale testing. 
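Before the RCU Tasks Trace readers that follow, note how little is needed to plug another synchronization mechanism into this ops table: a readsection/delaysection pair plus a struct ref_scale_ops. As a hypothetical illustration (not in this patch, and it would additionally need an entry in the scale_ops[] array in ref_scale_init()), a plain-spinlock flavor could look like the sketch below, reusing the un_delay() helper above.

static DEFINE_SPINLOCK(test_lock);

static void ref_spinlock_section(const int nloops)
{
	int i;

	for (i = nloops; i >= 0; i--) {
		spin_lock(&test_lock);
		spin_unlock(&test_lock);
	}
}

static void ref_spinlock_delay_section(const int nloops, const int udl, const int ndl)
{
	int i;

	for (i = nloops; i >= 0; i--) {
		spin_lock(&test_lock);
		un_delay(udl, ndl);
		spin_unlock(&test_lock);
	}
}

static struct ref_scale_ops spinlock_ops = {
	.readsection	= ref_spinlock_section,
	.delaysection	= ref_spinlock_delay_section,
	.name		= "spinlock"
};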
+static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. 
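The wait_event() on the next line pairs with main_func(), which publishes the per-reader start flag with smp_store_release() and then wakes the reader's waitqueue. Stripped of the torture-framework details, the handshake reduces to the following sketch (hypothetical names, same primitives):

static int start_flag;
static DECLARE_WAIT_QUEUE_HEAD(start_wq);

/* Reader side: sleep until the flag has been published. */
static void wait_for_start(void)
{
	wait_event(start_wq, smp_load_acquire(&start_flag));
}

/* Orchestrator side: publish the flag, then wake any sleeper. */
static void signal_start(void)
{
	smp_store_release(&start_flag, 1);
	wake_up(&start_wq);
}

The acquire load in the wait condition pairs with the release store, so anything the orchestrator wrote before signalling is visible to the reader once it proceeds.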
+ wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char *buf; + u64 sum = 0; + + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + SCALEOUT("%s\n", buf); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + bool errexit = false; + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + errexit = true; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. 
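Before moving on to main_func()'s startup code below, it is worth distilling the measurement loop above: each reader times one full rcu_scale_one_reader() invocation with interrupts disabled, using the monotonic fast clock, and records the result in rt->last_duration_ns. Reduced to its core (a sketch, not the exact code):

static u64 time_one_pass(void)
{
	unsigned long flags;
	u64 start, duration;

	local_irq_save(flags);			/* keep interrupts out of the timed window */
	start = ktime_get_mono_fast_ns();	/* monotonic, NMI-safe timestamp */
	rcu_scale_one_reader();			/* the read-side loop being measured */
	duration = ktime_get_mono_fast_ns() - start;
	local_irq_restore(flags);

	return duration;
}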
+ atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (errexit) + break; + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + + // Print the average of all experiments + SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + if (errexit) + break; + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + } + + if (!errexit) + SCALEOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. 
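Stepping back to the reporting arithmetic in main_func() above: result_avg[] holds nanoseconds-per-loop scaled by 1000, which div_u64_rem() later splits back into an integer part and a three-digit fractional part for the "%llu.%03u" format. A worked example with hypothetical numbers:

static void example_report_math(void)
{
	/* Hypothetical run: 4 readers, 10000 loops each, 12 ms of total read time. */
	u64 total_ns = 12000000;				/* as returned by process_durations() */
	u64 scaled = div_u64(1000 * total_ns, 4 * 10000);	/* 300000 */
	u32 rem;
	u64 avg = div_u64_rem(scaled, 1000, &rem);		/* avg = 300, rem = 0 */

	pr_info("%llu.%03u ns/loop\n", avg, rem);		/* prints "300.000 ns/loop" */
}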
+ ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); -- cgit v1.2.3 From 7fef6cff8f2814bf8eb632e2bb8f0a987ffd9ece Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 19:46:47 +0800 Subject: srcu: Fix a typo in comment "amoritized"->"amortized" This commit fixes a typo in a comment. Signed-off-by: Ethon Paul Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d3ef700fb0e..8ff71e5d0fe8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. 
*/ static bool srcu_might_be_idle(struct srcu_struct *ssp) -- cgit v1.2.3 From bde50d8ff83e4ce9e576f7c5ba1edb48a3610a5b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 26 May 2020 15:41:34 +0200 Subject: srcu: Avoid local_irq_save() before acquiring spinlock_t SRCU disables interrupts to get a stable per-CPU pointer and then acquires the spinlock which is in the per-CPU data structure. The release uses spin_unlock_irqrestore(). While this is correct on a non-RT kernel, this conflicts with the RT semantics because the spinlock is converted to a 'sleeping' spinlock. Sleeping locks can obviously not be acquired with interrupts disabled. Acquire the per-CPU pointer `ssp->sda' without disabling preemption and then acquire the spinlock_t of the per-CPU data structure. The lock will ensure that the data is consistent. The added call to check_init_srcu_struct() is now needed because a statically defined srcu_struct may remain uninitialized until this point and the newly introduced locking operation requires an initialized spinlock_t. This change was tested for four hours with 8*SRCU-N and 8*SRCU-P without causing any warnings. Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8ff71e5d0fe8..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. 
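The second hunk below applies the same conversion to __call_srcu(). In isolation, the pattern change is: stop relying on disabled interrupts to pin the per-CPU pointer, take it with raw_cpu_ptr() (this_cpu_ptr() would typically complain under CONFIG_DEBUG_PREEMPT once preemption is no longer disabled here), and let the per-sdp lock provide the exclusion, which on PREEMPT_RT may be a sleeping lock. A simplified before/after sketch, not the exact kernel code:

	/* Before: assumes spinlock_t is a true spinning lock. */
	local_irq_save(flags);
	sdp = this_cpu_ptr(ssp->sda);
	spin_lock_rcu_node(sdp);

	/* After: RT-friendly; correctness comes from the lock, not CPU-locality. */
	sdp = raw_cpu_ptr(ssp->sda);
	spin_lock_irqsave_rcu_node(sdp, flags);
	/*
	 * Even if the task migrated after raw_cpu_ptr(), sdp is still a valid
	 * srcu_data pointer, and its own lock keeps the data consistent.
	 */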
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); -- cgit v1.2.3 From d02c6b52d12fa30eeabfaf5aefe12078eacb94b2 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Mon, 13 Apr 2020 20:02:59 +0800 Subject: locktorture: Use true and false to assign to bool variables This commit fixes the following coccicheck warnings: kernel/locking/locktorture.c:689:6-10: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:907:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:938:3-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:668:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:674:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:634:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:640:2-20: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..8ff6f50e06a0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +904,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +935,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); -- cgit v1.2.3 From 4a5f133c15b77c4018e8d7996541868ac94afb4f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 24 Apr 2020 11:21:40 -0700 Subject: rcutorture: Add races with task-exit processing Several variants of Linux-kernel RCU interact with task-exit processing, including preemptible RCU, Tasks RCU, and Tasks Trace RCU. This commit therefore adds testing of this interaction to rcutorture by adding rcutorture.read_exit_burst and rcutorture.read_exit_delay kernel-boot parameters. These kernel parameters control the frequency and spacing of special read-then-exit kthreads that are spawned. [ paulmck: Apply feedback from Dan Carpenter's static checker. ] [ paulmck: Reduce latency to avoid false-positive shutdown hangs. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..2621a339c8a4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -109,6 +109,10 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -146,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -177,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? 
*/ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -1539,10 +1545,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1634,7 +1641,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1643,7 +1651,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2338,6 +2347,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. 
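The wake_up() on the next line completes a two-flag stop protocol with rcu_torture_read_exit_cleanup() further below. Reduced to its essentials, the pairing looks like this (a sketch of the two sides, using the same variables as the patch):

	/* Cleanup side (rcu_torture_read_exit_cleanup): request the stop, then wait. */
	WRITE_ONCE(read_exit_child_stop, true);
	smp_mb();	/* order the request before waiting */
	wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));

	/* Parent side (here): acknowledge completion, then wake the waiter. */
	smp_store_release(&read_exit_child_stopped, true);
	smp_mb();	/* order the acknowledgment before the wakeup */
	wake_up(&read_exit_wq);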
+ wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2359,6 +2461,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2680,6 +2783,9 @@ rcu_torture_init(void) if (firsterr) goto unwind; firsterr = rcu_torture_barrier_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_read_exit_init(); if (firsterr) goto unwind; if (object_debug) -- cgit v1.2.3 From cae7cc6ba5bad320c2055ac54f73affd051e76ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Apr 2020 19:20:37 -0700 Subject: rcutorture: NULL rcu_torture_current earlier in cleanup code Currently, the rcu_torture_current variable remains non-NULL until after all readers have stopped. During this time, rcu_torture_stats_print() will think that the test is still ongoing, which can result in confusing dmesg output. This commit therefore NULLs rcu_torture_current immediately after the rcu_torture_writer() kthread has decided to stop, thus informing rcu_torture_stats_print() much sooner. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2621a339c8a4..59112077a6da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1172,6 +1172,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ if (expediting > 0) expediting = -expediting; @@ -2473,7 +2474,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { -- cgit v1.2.3 From 8f43d5911b38f00dfa46169dcb1feb1e101dd906 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:48 +0100 Subject: rcu/rcutorture: Replace 0 with false Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 0. This commit therefore replaces the 0 with a false. Signed-off-by: Jules Irenge Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59112077a6da..37455a12898e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2185,7 +2185,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; -- cgit v1.2.3 From 775227511843202e65a7f194cbf64f38de01f004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2020 16:43:14 -0700 Subject: rcutorture: Check for unwatched readers RCU is supposed to be watching all non-idle kernel code and also all softirq handlers. This commit adds some teeth to this statement by adding a WARN_ON_ONCE(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37455a12898e..9c310016585b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1377,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); -- cgit v1.2.3 From 2102ad290af06119ccfb56ddc3a0e5011a91537e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 15:38:24 -0700 Subject: torture: Dump ftrace at shutdown only if requested If there is a large number of torture tests running concurrently, all of which are dumping large ftrace buffers at shutdown time, the resulting dumping can take a very long time, particularly on systems with rotating-rust storage. This commit therefore adds a default-off torture.ftrace_dump_at_shutdown module parameter that enables shutdown-time ftrace-buffer dumping. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney "); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. 
 */ return 0; } -- cgit v1.2.3 From cda099b37d7165fc73a63961739acf026444cde2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Feb 2020 11:00:54 -0800 Subject: fork: Annotate a data race in vm_area_dup() struct vm_area_struct could be accessed concurrently as noticed by KCSAN, write to 0xffff9cf8bba08ad8 of 8 bytes by task 14263 on cpu 35: vma_interval_tree_insert+0x101/0x150: rb_insert_augmented_cached at include/linux/rbtree_augmented.h:58 (inlined by) vma_interval_tree_insert at mm/interval_tree.c:23 __vma_link_file+0x6e/0xe0 __vma_link_file at mm/mmap.c:629 vma_link+0xa2/0x120 mmap_region+0x753/0xb90 do_mmap+0x45c/0x710 vm_mmap_pgoff+0xc0/0x130 ksys_mmap_pgoff+0x1d1/0x300 __x64_sys_mmap+0x33/0x40 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe read to 0xffff9cf8bba08a80 of 200 bytes by task 14262 on cpu 122: vm_area_dup+0x6a/0xe0 vm_area_dup at kernel/fork.c:362 __split_vma+0x72/0x2a0 __split_vma at mm/mmap.c:2661 split_vma+0x5a/0x80 mprotect_fixup+0x368/0x3f0 do_mprotect_pkey+0x263/0x420 __x64_sys_mprotect+0x51/0x70 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe vm_area_dup() blindly copies all fields of the original VMA to the new one. This includes copying vm_area_struct::shared.rb, which is normally protected by i_mmap_lock. But this is fine because the read value will be overwritten on the following __vma_link_file() under proper protection. Thus, mark it as an intentional data race and insert a few assertions for the fields that should not be modified concurrently. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..bba10fbcdce7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } -- cgit v1.2.3 From 1fe84fd4a4027a17d511a832f89ab14107650ba4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 5 May 2020 20:28:21 +0200 Subject: kcsan: Add test suite This adds a KCSAN test focusing on behaviour of the integrated runtime. Tests various race scenarios, and verifies the reports generated to console. Makes use of KUnit for test organization, and the Torture framework for test thread control. Signed-off-by: Marco Elver Signed-off-by: Paul E.
McKenney --- kernel/kcsan/Makefile | 3 + kernel/kcsan/kcsan-test.c | 1084 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1087 insertions(+) create mode 100644 kernel/kcsan/kcsan-test.c (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index d4999b38d1be..14533cf24bc3 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -12,3 +12,6 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += test.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..a8c11506dd2a --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1084 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarious to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. + * Author: Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. */ +static __no_kcsan_or_inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan_or_inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. 
+ */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. */ + strcpy(observed.lines[nlines++], ""); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, ""); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. 
*/ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. */ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. */ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. + */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. 
*/ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. */ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. 
*/ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. 
+ */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. */ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. 
*/ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 
0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* Test that racing accesses in seqlock critical sections are not reported. */ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). 
[The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. + */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. */ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. 
*/ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. 
+ */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver "); -- cgit v1.2.3 From 33190b675ce2eacbeb4e75168c05b41110b506ec Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 11 Feb 2020 08:54:15 -0500 Subject: locking/osq_lock: Annotate a data race in osq_lock The prev->next pointer can be accessed concurrently as noticed by KCSAN: write (marked) to 0xffff9d3370dbbe40 of 8 bytes by task 3294 on cpu 107: osq_lock+0x25f/0x350 osq_wait_next at kernel/locking/osq_lock.c:79 (inlined by) osq_lock at kernel/locking/osq_lock.c:185 rwsem_optimistic_spin read to 0xffff9d3370dbbe40 of 8 bytes by task 3398 on cpu 100: osq_lock+0x196/0x350 osq_lock at kernel/locking/osq_lock.c:157 rwsem_optimistic_spin Since the write only stores NULL to prev->next and the read tests if prev->next equals to this_cpu_ptr(&osq_node). Even if the value is shattered, the code is still working correctly. Thus, mark it as an intentional data race using the data_race() macro. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/locking/osq_lock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; -- cgit v1.2.3 From 2888557f68db334a3839dcc262264a4c436f576b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 2 Jun 2020 16:36:33 +0200 Subject: kcsan: Prefer '__no_kcsan inline' in test Instead of __no_kcsan_or_inline, prefer '__no_kcsan inline' in test -- this is in case we decide to remove __no_kcsan_or_inline. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index a8c11506dd2a..3af420ad6ee7 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -43,7 +43,7 @@ static struct { }; /* Setup test checking loop. */ -static __no_kcsan_or_inline void +static __no_kcsan inline void begin_test_checks(void (*func1)(void), void (*func2)(void)) { kcsan_disable_current(); @@ -60,7 +60,7 @@ begin_test_checks(void (*func1)(void), void (*func2)(void)) } /* End test checking loop. */ -static __no_kcsan_or_inline bool +static __no_kcsan inline bool end_test_checks(bool stop) { if (!stop && time_before(jiffies, end_time)) { -- cgit v1.2.3 From 9dd979bae4cf76558ff816abe83283308fb1ae8c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:22 +0200 Subject: kcsan: Silence -Wmissing-prototypes warning with W=1 The functions here should not be forward declared for explicit use elsewhere in the kernel, as they should only be emitted by the compiler due to sanitizer instrumentation. 
Add forward declarations a line above their definition to shut up warnings in W=1 builds. Link: https://lkml.kernel.org/r/202006060103.jSCpnV1g%lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..1866bafda4fd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -754,6 +754,7 @@ EXPORT_SYMBOL(__kcsan_check_access); */ #define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ void __tsan_read##size(void *ptr) \ { \ check_access(ptr, size, 0); \ @@ -762,6 +763,7 @@ EXPORT_SYMBOL(__kcsan_check_access); void __tsan_unaligned_read##size(void *ptr) \ __alias(__tsan_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ void __tsan_write##size(void *ptr) \ { \ check_access(ptr, size, KCSAN_ACCESS_WRITE); \ @@ -777,12 +779,14 @@ DEFINE_TSAN_READ_WRITE(4); DEFINE_TSAN_READ_WRITE(8); DEFINE_TSAN_READ_WRITE(16); +void __tsan_read_range(void *ptr, size_t size); void __tsan_read_range(void *ptr, size_t size) { check_access(ptr, size, 0); } EXPORT_SYMBOL(__tsan_read_range); +void __tsan_write_range(void *ptr, size_t size); void __tsan_write_range(void *ptr, size_t size) { check_access(ptr, size, KCSAN_ACCESS_WRITE); @@ -799,6 +803,7 @@ EXPORT_SYMBOL(__tsan_write_range); * the size-check of compiletime_assert_rwonce_type(). */ #define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ void __tsan_volatile_read##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -811,6 +816,7 @@ EXPORT_SYMBOL(__tsan_write_range); void __tsan_unaligned_volatile_read##size(void *ptr) \ __alias(__tsan_volatile_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ void __tsan_volatile_write##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -836,14 +842,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16); * The below are not required by KCSAN, but can still be emitted by the * compiler. */ +void __tsan_func_entry(void *call_pc); void __tsan_func_entry(void *call_pc) { } EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); void __tsan_func_exit(void) { } EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); void __tsan_init(void) { } -- cgit v1.2.3 From acfa087ccf2d2eff46186477f53e4c3ffbdb033d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:23 +0200 Subject: kcsan: Rename test.c to selftest.c Rename 'test.c' to 'selftest.c' to better reflect its purpose (Kconfig variable and code inside already match this). This is to avoid confusion with the test suite module in 'kcsan-test.c'. No functional change. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/Makefile | 2 +- kernel/kcsan/selftest.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kcsan/test.c | 131 ------------------------------------------------ 3 files changed, 132 insertions(+), 132 deletions(-) create mode 100644 kernel/kcsan/selftest.c delete mode 100644 kernel/kcsan/test.c (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 14533cf24bc3..092ce58d2e56 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -11,7 +11,7 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ $(call cc-option,-fno-stack-protector,) obj-y := core.o debugfs.o report.o -obj-$(CONFIG_KCSAN_SELFTEST) += test.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c new file mode 100644 index 000000000000..d26a052d3383 --- /dev/null +++ b/kernel/kcsan/selftest.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. + */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* Encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* Check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* Check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. */ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + + /* + * An access of size 0 could match another access, as demonstrated here. 
+ * Rather than add more comparisons to 'matching_access()', which would + * end up in the fast-path for *all* checks, check_access() simply + * returns for all accesses of size 0. + */ + if (WARN_ON(!matching_access(8, 8, 12, 0))) + return false; + + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c deleted file mode 100644 index d26a052d3383..000000000000 --- a/kernel/kcsan/test.c +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -#include "encoding.h" - -#define ITERS_PER_TEST 2000 - -/* Test requirements. */ -static bool test_requires(void) -{ - /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; -} - -/* - * Test watchpoint encode and decode: check that encoding some access's info, - * and then subsequent decode preserves the access's info. - */ -static bool test_encode_decode(void) -{ - int i; - - for (i = 0; i < ITERS_PER_TEST; ++i) { - size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; - bool is_write = !!prandom_u32_max(2); - unsigned long addr; - - prandom_bytes(&addr, sizeof(addr)); - if (WARN_ON(!check_encodable(addr, size))) - return false; - - /* Encode and decode */ - { - const long encoded_watchpoint = - encode_watchpoint(addr, size, is_write); - unsigned long verif_masked_addr; - size_t verif_size; - bool verif_is_write; - - /* Check special watchpoints */ - if (WARN_ON(decode_watchpoint( - INVALID_WATCHPOINT, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - if (WARN_ON(decode_watchpoint( - CONSUMED_WATCHPOINT, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - - /* Check decoding watchpoint returns same data */ - if (WARN_ON(!decode_watchpoint( - encoded_watchpoint, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - if (WARN_ON(verif_masked_addr != - (addr & WATCHPOINT_ADDR_MASK))) - goto fail; - if (WARN_ON(verif_size != size)) - goto fail; - if (WARN_ON(is_write != verif_is_write)) - goto fail; - - continue; -fail: - pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", - __func__, is_write ? "write" : "read", size, - addr, encoded_watchpoint, - verif_is_write ? "write" : "read", verif_size, - verif_masked_addr); - return false; - } - } - - return true; -} - -/* Test access matching function. */ -static bool test_matching_access(void) -{ - if (WARN_ON(!matching_access(10, 1, 10, 1))) - return false; - if (WARN_ON(!matching_access(10, 2, 11, 1))) - return false; - if (WARN_ON(!matching_access(10, 1, 9, 2))) - return false; - if (WARN_ON(matching_access(10, 1, 11, 1))) - return false; - if (WARN_ON(matching_access(9, 1, 10, 1))) - return false; - - /* - * An access of size 0 could match another access, as demonstrated here. - * Rather than add more comparisons to 'matching_access()', which would - * end up in the fast-path for *all* checks, check_access() simply - * returns for all accesses of size 0. 
- */ - if (WARN_ON(!matching_access(8, 8, 12, 0))) - return false; - - return true; -} - -static int __init kcsan_selftest(void) -{ - int passed = 0; - int total = 0; - -#define RUN_TEST(do_test) \ - do { \ - ++total; \ - if (do_test()) \ - ++passed; \ - else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ - } while (0) - - RUN_TEST(test_requires); - RUN_TEST(test_encode_decode); - RUN_TEST(test_matching_access); - - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); - if (passed != total) - panic("KCSAN selftests failed"); - return 0; -} -postcore_initcall(kcsan_selftest); -- cgit v1.2.3 From 7e766560e6e2c1cf2782f00e63c31564e4c9f0fe Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:24 +0200 Subject: kcsan: Remove existing special atomic rules Remove existing special atomic rules from kcsan_is_atomic_special() because they are no longer needed. Since we rely on the compiler emitting instrumentation distinguishing volatile accesses, the rules have become redundant. Let's keep kcsan_is_atomic_special() around, so that we have an obvious place to add special rules should the need arise in future. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index be9e625227f3..75fe701f4127 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -3,8 +3,7 @@ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H -#include -#include +#include /* * Special rules for certain memory where concurrent conflicting accesses are @@ -13,8 +12,7 @@ */ static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* volatile globals that have been observed in data races. */ - return ptr == &jiffies || ptr == ¤t->state; + return false; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ -- cgit v1.2.3 From 56b031f0abf55254d47a329010574733fa9a27b8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:25 +0200 Subject: kcsan: Add jiffies test to test suite Add a test that KCSAN nor the compiler gets confused about accesses to jiffies on different architectures. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index 3af420ad6ee7..fed6fcb5768c 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -366,6 +366,11 @@ static noinline void test_kernel_read_struct_zero_size(void) kcsan_check_read(&test_struct.val[3], 0); } +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + static noinline void test_kernel_seqlock_reader(void) { unsigned int seq; @@ -817,6 +822,23 @@ static void test_assert_exclusive_access_scoped(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect_inscope); } +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. 
+ */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + /* Test that racing accesses in seqlock critical sections are not reported. */ __no_kcsan static void test_seqlock_noreport(struct kunit *test) @@ -867,6 +889,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), {}, }; -- cgit v1.2.3 From 2839a232071f588d334543fb86f5689b43353842 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:17 +0200 Subject: kcsan: Simplify compiler flags Simplify the set of compiler flags for the runtime by removing cc-option from -fno-stack-protector, because all supported compilers support it. This saves us one compiler invocation during build. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 092ce58d2e56..fea064afc4f7 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -7,8 +7,8 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ - $(call cc-option,-fno-stack-protector,) +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -- cgit v1.2.3 From 61d56d7aa5eca3b909bce51ba8125b0fa44d7e17 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:18 +0200 Subject: kcsan: Disable branch tracing in core runtime Disable branch tracing in core KCSAN runtime if branches are being traced (TRACE_BRANCH_PROFILING). This it to avoid its performance impact, but also avoid recursion in case KCSAN is enabled for the branch tracing runtime. The latter had already been a problem for KASAN: https://lore.kernel.org/lkml/CANpmjNOeXmD5E3O50Z3MjkiuCYaYOPyi+1rq=GZvEKwBvLR0Ug@mail.gmail.com/ Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index fea064afc4f7..65ca5539c470 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,7 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ - -fno-stack-protector + -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -- cgit v1.2.3 From 142240398e50e5fe3171bcf2459856603be13a39 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 27 Jun 2020 23:24:19 -0400 Subject: audit: add gfp parameter to audit_log_nfcfg Fixed an inconsistent use of GFP flags in nft_obj_notify() that used GFP_KERNEL when a GFP flag was passed in to that function. 
Given this allocated memory was then used in audit_log_nfcfg() it led to an audit of all other GFP allocations in net/netfilter/nf_tables_api.c and a modification of audit_log_nfcfg() to accept a GFP parameter. Reported-by: Dan Carptenter Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3a9100e95fda..eae1a599ffe3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2572,12 +2572,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op) + enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", -- cgit v1.2.3 From 3aa91625007807bfca4155df1867a5c924a08662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: dma-mapping: Add a new dma_need_sync API Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de --- kernel/dma/direct.c | 6 ++++++ kernel/dma/mapping.c | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a4881e59aa7..ecb922a0bfa0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -530,3 +530,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From 517bbe1994a3cee29a35c730662277bb5daff582 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jun 2020 23:15:00 -0700 Subject: bpf: Enforce BPF ringbuf size to be the power of 2 BPF ringbuf assumes the size to be a multiple of page size and the power of 2 value. The latter is important to avoid division while calculating position inside the ring buffer and using (N-1) mask instead. This patch fixes omission to enforce power-of-2 size rule. 
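As an aside, the effect of the power-of-2 rule is easy to demonstrate in a stand-alone user-space C sketch (purely illustrative; the helper names below are made up and this is not the kernel's ringbuf code): when the data area size is a power of two, the producer position is reduced to a buffer offset with a single AND against (size - 1), giving the same result as the division the ring buffer is trying to avoid.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the property the patch now enforces with is_power_of_2(). */
static bool size_is_power_of_2(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* Reduce a monotonically increasing position to an offset in the buffer. */
static uint64_t ringbuf_offset(uint64_t pos, uint64_t size)
{
	return pos & (size - 1);	/* only valid for power-of-2 sizes */
}

int main(void)
{
	const uint64_t size = 4096;	/* one page, and a power of 2 */
	uint64_t pos;

	assert(size_is_power_of_2(size));

	/* The mask and the (slower) modulo agree for every position. */
	for (pos = 0; pos < 3 * size; pos += 512)
		assert(ringbuf_offset(pos, size) == pos % size);

	printf("offset of position %llu is %llu\n",
	       (unsigned long long)(size + 128),
	       (unsigned long long)ringbuf_offset(size + 128, size));
	return 0;
}

With a size that is page-aligned but not a power of two, the mask would silently compute the wrong offset, which is why the check belongs at map creation time rather than in the fast path.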
Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200630061500.1804799-1-andriin@fb.com --- kernel/bpf/ringbuf.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..0af88bbc1c15 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -132,15 +132,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; - if (!data_sz || !PAGE_ALIGNED(data_sz)) - return ERR_PTR(-EINVAL); - -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (data_sz > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return ERR_PTR(-ENOMEM); @@ -166,9 +157,16 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || - attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + !is_power_of_2(attr->max_entries) || + !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (attr->max_entries > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); if (!rb_map) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 3b7016996c4c44db5d499d98759b82fb714bb912 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:54 +0200 Subject: flow_dissector: Pull BPF program assignment up to bpf-netns Prepare for using bpf_prog_array to store attached programs by moving out code that updates the attached program out of flow dissector. Managing bpf_prog_array is more involved than updating a single bpf_prog pointer. This will let us do it all from one place, bpf/net_namespace.c, in the subsequent patch. No functional change intended. 
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-2-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 78cf061f8179..b951dab2687f 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -189,6 +189,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type type; + struct bpf_prog *attached; struct net *net; int ret; @@ -207,12 +208,26 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); + ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } + if (ret) + goto out_unlock; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + rcu_assign_pointer(net->bpf.progs[type], prog); + if (attached) + bpf_prog_put(attached); + out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -277,7 +292,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - err = flow_dissector_bpf_prog_attach(net, link->prog); + err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; default: err = -EINVAL; @@ -286,6 +301,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; + rcu_assign_pointer(net->bpf.progs[type], link->prog); net->bpf.links[type] = link; out_unlock: -- cgit v1.2.3 From 695c12147a40181fe9221d321c3f2de33c9574ed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:55 +0200 Subject: bpf, netns: Keep attached programs in bpf_prog_array Prepare for having multi-prog attachments for new netns attach types by storing programs to run in a bpf_prog_array, which is well suited for iterating over programs and running them in sequence. After this change bpf(PROG_QUERY) may block to allocate memory in bpf_prog_array_copy_to_user() for collected program IDs. This forces a change in how we protect access to the attached program in the query callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from an RCU read lock to holding a mutex that serializes updaters. Because we allow only one BPF flow_dissector program to be attached to netns at all times, the bpf_prog_array pointed by net->bpf.run_array is always either detached (null) or one element long. No functional changes intended. 
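For readers not familiar with the data structure, the following stand-alone C sketch shows the property this change relies on (the "mock_" types are simplified stand-ins invented for illustration, not the kernel's real definitions): a prog array is a NULL-terminated sequence of items that the dispatch path can walk and run in order, so a single attached program is naturally represented as a one-element array.

#include <stdio.h>
#include <stdlib.h>

struct mock_prog {
	const char *name;
};

struct mock_prog_array_item {
	struct mock_prog *prog;
};

struct mock_prog_array {
	long rcu_placeholder;			/* the real struct starts with an rcu_head */
	struct mock_prog_array_item items[];	/* terminated by a NULL prog */
};

/* Walk the array and "run" each program, as the hook's run path would. */
static unsigned int run_array(const struct mock_prog_array *array)
{
	const struct mock_prog_array_item *item;
	unsigned int cnt = 0;

	for (item = array->items; item->prog; item++) {
		printf("running %s\n", item->prog->name);
		cnt++;
	}
	return cnt;
}

int main(void)
{
	struct mock_prog flow_dissector = { .name = "flow_dissector" };
	/* One attached program: one element plus the NULL terminator. */
	struct mock_prog_array *array =
		calloc(1, sizeof(*array) + 2 * sizeof(array->items[0]));

	if (!array)
		return 1;
	array->items[0].prog = &flow_dissector;

	printf("ran %u program(s)\n", run_array(array));
	free(array);
	return 0;
}

Keeping even a single program in this form lets attach, detach, query and the run path all share one representation, which is what the later patches in this series build on.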
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 120 +++++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b951dab2687f..0dba97202357 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -33,6 +33,17 @@ static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) net_link->net = NULL; } +/* Must be called with netns_bpf_mutex held. */ +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog_array *run_array; + + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -54,8 +65,8 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; + netns_bpf_run_array_detach(net, type); net->bpf.links[type] = NULL; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -76,6 +87,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; struct net *net; int ret = 0; @@ -93,8 +105,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, goto out_unlock; } + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + WRITE_ONCE(run_array->items[0].prog, new_prog); + old_prog = xchg(&link->prog, new_prog); - rcu_assign_pointer(net->bpf.progs[type], new_prog); bpf_prog_put(old_prog); out_unlock: @@ -142,14 +157,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = { .show_fdinfo = bpf_netns_link_show_fdinfo, }; +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; enum netns_bpf_attach_type type; - struct bpf_prog *attached; struct net *net; + int ret; if (attr->query.query_flags) return -EINVAL; @@ -162,32 +201,17 @@ int netns_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(net)) return PTR_ERR(net); - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; + return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; @@ -217,14 +241,28 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (ret) goto out_unlock; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } - rcu_assign_pointer(net->bpf.progs[type], prog); + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); @@ -244,11 +282,11 @@ static int __netns_bpf_prog_detach(struct net *net, if (net->bpf.links[type]) return -EINVAL; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (!attached) return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } @@ -272,7 +310,7 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum 
netns_bpf_attach_type type) { - struct bpf_prog *prog; + struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); @@ -283,9 +321,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, goto out_unlock; } /* Links are not compatible with attaching prog directly */ - prog = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (prog) { + if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } @@ -301,7 +337,14 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - rcu_assign_pointer(net->bpf.progs[type], link->prog); + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = link->prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + net->bpf.links[type] = link; out_unlock: @@ -368,11 +411,12 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + netns_bpf_run_array_detach(net, type); link = net->bpf.links[type]; if (link) bpf_netns_link_auto_detach(link); - else - __netns_bpf_prog_detach(net, type); + else if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } -- cgit v1.2.3 From ab53cad90eb10c9991f501ba08904680a074ef3d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:56 +0200 Subject: bpf, netns: Keep a list of attached bpf_link's To support multi-prog link-based attachments for new netns attach types, we need to keep track of more than one bpf_link per attach type. Hence, convert net->bpf.links into a list, that currently can be either empty or have just one item. Instead of reusing bpf_prog_list from bpf-cgroup, we link together bpf_netns_link's themselves. This makes list management simpler as we don't have to allocate, initialize, and later release list elements. We can do this because multi-prog attachment will be available only for bpf_link, and we don't need to build a list of programs attached directly and indirectly via links. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-4-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 0dba97202357..7a34a8caf954 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -19,20 +19,12 @@ struct bpf_netns_link { * with netns_bpf_mutex held. */ struct net *net; + struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); -/* Must be called with netns_bpf_mutex held. */ -static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) -{ - struct bpf_netns_link *net_link = - container_of(link, struct bpf_netns_link, link); - - net_link->net = NULL; -} - /* Must be called with netns_bpf_mutex held. 
*/ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -66,7 +58,7 @@ static void bpf_netns_link_release(struct bpf_link *link) goto out_unlock; netns_bpf_run_array_detach(net, type); - net->bpf.links[type] = NULL; + list_del(&net_link->node); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -225,7 +217,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } @@ -279,7 +271,7 @@ static int __netns_bpf_prog_detach(struct net *net, struct bpf_prog *attached; /* Progs attached via links cannot be detached */ - if (net->bpf.links[type]) + if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; @@ -310,13 +302,15 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); /* Allow attaching only one prog or link for now */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { err = -E2BIG; goto out_unlock; } @@ -345,7 +339,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, run_array->items[0].prog = link->prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); - net->bpf.links[type] = link; + list_add_tail(&net_link->node, &net->bpf.links[type]); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -404,24 +398,34 @@ out_put_net: return err; } +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; - struct bpf_link *link; + struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - link = net->bpf.links[type]; - if (link) - bpf_netns_link_auto_detach(link); - else if (net->bpf.progs[type]) + list_for_each_entry(net_link, &net->bpf.links[type], node) + net_link->net = NULL; /* auto-detach link */ + if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; -- cgit v1.2.3 From 1b514239e85965cc4df085180a73dd91733135f7 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:25 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_ATTACH Using BPF_PROG_ATTACH on a flow dissector program supports neither target_fd, attach_flags or replace_bpf_fd but accepts any value. Enforce that all of them are zero. This is fine for replace_bpf_fd since its presence is indicated by BPF_F_REPLACE. It's more problematic for target_fd, since zero is a valid fd. Should we want to use the flag later on we'd have to add an exception for fd 0. The alternative is to force a value like -1. This requires more changes to tests. There is also precedent for using 0, since bpf_iter uses this for target_fd as well. 
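In user space this simply means a flow dissector attach leaves the unused fields zeroed. A minimal sketch of the raw bpf(2) call follows (the helper name is made up and prog_fd is assumed to be an already loaded BPF_PROG_TYPE_FLOW_DISSECTOR program; real code would typically go through libbpf and check errno):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Attach an already-loaded flow dissector program to the current netns.
 * target_fd, attach_flags and replace_bpf_fd stay zero; after this change
 * the kernel rejects any other value with -EINVAL.
 * Returns 0 on success, -1 with errno set on failure.
 */
int attach_flow_dissector(int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type = BPF_FLOW_DISSECTOR;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

The detach counterpart is tightened in the next patch to require the same program fd to be passed in attach_bpf_fd.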
Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-2-lmb@cloudflare.com --- kernel/bpf/net_namespace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 7a34a8caf954..03045f45afec 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -209,6 +209,9 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct net *net; int ret; + if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; -- cgit v1.2.3 From 4ac2add65974e4efafb8d4ccd8fc5660417ea312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:26 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_DETACH Using BPF_PROG_DETACH on a flow dissector program supports neither attach_flags nor attach_bpf_fd. Yet no value is enforced for them. Enforce that attach_flags are zero, and require the current program to be passed via attach_bpf_fd. This allows us to remove the check for CAP_SYS_ADMIN, since userspace can now no longer remove arbitrary flow dissector programs. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-3-lmb@cloudflare.com --- kernel/bpf/net_namespace.c | 19 +++++++++++++++---- kernel/bpf/syscall.c | 4 +--- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 03045f45afec..3dbc29b6f51d 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -269,7 +269,8 @@ out_unlock: /* Must be called with netns_bpf_mutex held. 
*/ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; @@ -278,7 +279,7 @@ static int __netns_bpf_prog_detach(struct net *net, return -EINVAL; attached = net->bpf.progs[type]; - if (!attached) + if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; @@ -286,19 +287,29 @@ static int __netns_bpf_prog_detach(struct net *net, return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d946435587d..28c6ef759037 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2897,9 +2897,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28c6ef759037..a74fce8ce043 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2893,7 +2893,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: -- cgit v1.2.3 From 2576f87066dc08a11cb1c05f11d1eaa02148ef9e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 30 Jun 2020 18:45:41 +0200 Subject: bpf, netns: Fix use-after-free in pernet pre_exit callback Iterating over BPF links attached to network namespace in pre_exit hook is not safe, even if there is just one. 
Once link gets auto-detached, that is its back-pointer to net object is set to NULL, the link can be released and freed without waiting on netns_bpf_mutex, effectively causing the list element we are operating on to be freed. This leads to use-after-free when trying to access the next element on the list, as reported by KASAN. Bug can be triggered by destroying a network namespace, while also releasing a link attached to this network namespace. | ================================================================== | BUG: KASAN: use-after-free in netns_bpf_pernet_pre_exit+0xd9/0x130 | Read of size 8 at addr ffff888119e0d778 by task kworker/u8:2/177 | | CPU: 3 PID: 177 Comm: kworker/u8:2 Not tainted 5.8.0-rc1-00197-ga0c04c9d1008-dirty #776 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: netns cleanup_net | Call Trace: | dump_stack+0x9e/0xe0 | print_address_description.constprop.0+0x3a/0x60 | ? netns_bpf_pernet_pre_exit+0xd9/0x130 | kasan_report.cold+0x1f/0x40 | ? netns_bpf_pernet_pre_exit+0xd9/0x130 | netns_bpf_pernet_pre_exit+0xd9/0x130 | cleanup_net+0x30b/0x5b0 | ? unregister_pernet_device+0x50/0x50 | ? rcu_read_lock_bh_held+0xb0/0xb0 | ? _raw_spin_unlock_irq+0x24/0x50 | process_one_work+0x4d1/0xa10 | ? lock_release+0x3e0/0x3e0 | ? pwq_dec_nr_in_flight+0x110/0x110 | ? rwlock_bug.part.0+0x60/0x60 | worker_thread+0x7a/0x5c0 | ? process_one_work+0xa10/0xa10 | kthread+0x1e3/0x240 | ? kthread_create_on_node+0xd0/0xd0 | ret_from_fork+0x1f/0x30 | | Allocated by task 280: | save_stack+0x1b/0x40 | __kasan_kmalloc.constprop.0+0xc2/0xd0 | netns_bpf_link_create+0xfe/0x650 | __do_sys_bpf+0x153a/0x2a50 | do_syscall_64+0x59/0x300 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 | | Freed by task 198: | save_stack+0x1b/0x40 | __kasan_slab_free+0x12f/0x180 | kfree+0xed/0x350 | process_one_work+0x4d1/0xa10 | worker_thread+0x7a/0x5c0 | kthread+0x1e3/0x240 | ret_from_fork+0x1f/0x30 | | The buggy address belongs to the object at ffff888119e0d700 | which belongs to the cache kmalloc-192 of size 192 | The buggy address is located 120 bytes inside of | 192-byte region [ffff888119e0d700, ffff888119e0d7c0) | The buggy address belongs to the page: | page:ffffea0004678340 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 | flags: 0x2fffe0000000200(slab) | raw: 02fffe0000000200 ffffea00045ba8c0 0000000600000006 ffff88811a80ea80 | raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 | page dumped because: kasan: bad access detected | | Memory state around the buggy address: | ffff888119e0d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ffff888119e0d680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc | >ffff888119e0d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ^ | ffff888119e0d780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc | ffff888119e0d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ================================================================== Remove the "fast-path" for releasing a link that got auto-detached by a dying network namespace to fix it. This way as long as link is on the list and netns_bpf mutex is held, we have a guarantee that link memory can be accessed. An alternative way to fix this issue would be to safely iterate over the list of links and ensure there is no access to link object after detaching it. But, at the moment, optimizing synchronization overhead on link release without a workload in mind seems like an overkill. 
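The locking discipline that the fix restores can be modelled in a small user-space sketch (a pthread mutex and a hand-rolled singly linked list stand in for netns_bpf_mutex and the per-netns link list; all names here are illustrative, not the kernel code): as long as every unlink-and-free goes through the same mutex that the iterator holds, the walk in the pre_exit path can never dereference freed memory.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ netns_bpf_mutex */

struct link {
	struct link *next;
	int auto_detached;	/* models net_link->net being cleared */
};

static struct link *links;	/* ~ net->bpf.links[type] */

/* Release path: no early return, always unlink under the lock. */
static void link_release(struct link *link)
{
	struct link **pp;

	pthread_mutex_lock(&list_lock);
	for (pp = &links; *pp; pp = &(*pp)->next) {
		if (*pp == link) {
			*pp = link->next;
			break;
		}
	}
	pthread_mutex_unlock(&list_lock);
	free(link);	/* node is off the list, so no walk can reach it */
}

/* pre_exit path: walks the list with the lock held for the whole loop. */
static void pre_exit_detach_all(void)
{
	struct link *link;

	pthread_mutex_lock(&list_lock);
	for (link = links; link; link = link->next)
		link->auto_detached = 1;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct link *a = calloc(1, sizeof(*a));
	struct link *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->next = b;
	links = a;

	pre_exit_detach_all();
	link_release(a);
	link_release(b);
	puts("done");
	return 0;
}

The buggy version corresponds to link_release() returning early when it sees auto_detached set, before taking the lock; that is exactly the window in which the walk above could read link->next from an object that has already been freed.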
Fixes: ab53cad90eb1 ("bpf, netns: Keep a list of attached bpf_link's") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200630164541.1329993-1-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 3dbc29b6f51d..310241ca7991 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -43,15 +43,11 @@ static void bpf_netns_link_release(struct bpf_link *link) enum netns_bpf_attach_type type = net_link->netns_type; struct net *net; - /* Link auto-detached by dying netns. */ - if (!net_link->net) - return; - mutex_lock(&netns_bpf_mutex); - /* Recheck after potential sleep. We can race with cleanup_net - * here, but if we see a non-NULL struct net pointer pre_exit - * has not happened yet and will block on netns_bpf_mutex. + /* We can race with cleanup_net, but if we see a non-NULL + * struct net pointer, pre_exit has not run yet and wait for + * netns_bpf_mutex. */ net = net_link->net; if (!net) -- cgit v1.2.3 From e91b48162332480f5840902268108bb7fb7a44c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2020 17:32:54 +0200 Subject: task_work: teach task_work_add() to do signal_wake_up() So that the target task will exit the wait_event_interruptible-like loop and call task_work_run() asap. The patch turns "bool notify" into 0,TWA_RESUME,TWA_SIGNAL enum, the new TWA_SIGNAL flag implies signal_wake_up(). However, it needs to avoid the race with recalc_sigpending(), so the patch also adds the new JOBCTL_TASK_WORK bit included in JOBCTL_PENDING_MASK. TODO: once this patch is merged we need to change all current users of task_work_add(notify = true) to use TWA_RESUME. Cc: stable@vger.kernel.org # v5.7 Acked-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/signal.c | 10 +++++++--- kernel/task_work.c | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5ca48cc5da76..ee22ec78fd6d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) - task_work_run(); - if (unlikely(uprobe_deny_signal())) return false; @@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + if (unlikely(current->task_works)) { + spin_unlock_irq(&sighand->siglock); + task_work_run(); + goto relock; + } + /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes diff --git a/kernel/task_work.c b/kernel/task_work.c index 825f28259a19..5c0848ca1287 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * 0 if succeeds or -ESRCH. 
*/ int -task_work_add(struct task_struct *task, struct callback_head *work, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; + unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); - if (notify) + switch (notify) { + case TWA_RESUME: set_notify_resume(task); + break; + case TWA_SIGNAL: + if (lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } + break; + } + return 0; } -- cgit v1.2.3 From 7ef282e05132d56b6f6b71e3873f317664bea78b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Jun 2020 23:45:56 -0400 Subject: tracing: Move pipe reference to trace array instead of current_tracer If a process has the trace_pipe open on a trace_array, the current tracer for that trace array should not be changed. This was original enforced by a global lock, but when instances were introduced, it was moved to the current_trace. But this structure is shared by all instances, and a trace_pipe is for a single instance. There's no reason that a process that has trace_pipe open on one instance should prevent another instance from changing its current tracer. Move the reference counter to the trace_array instead. This is marked as "Fixes" but is more of a clean up than a true fix. Backport if you want, but its not critical. Fixes: cf6ab6d9143b1 ("tracing: Add ref count to tracer for when they are being read by pipe") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++------ kernel/trace/trace.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8241d1448d70..64c5b8146cca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5891,7 +5891,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } /* If trace pipe files are being read, we can't change the tracer */ - if (tr->current_trace->ref) { + if (tr->trace_ref) { ret = -EBUSY; goto out; } @@ -6107,7 +6107,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); - tr->current_trace->ref++; + tr->trace_ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -6126,7 +6126,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - tr->current_trace->ref--; + tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -7428,7 +7428,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; - tr->current_trace->ref++; + tr->trace_ref++; mutex_unlock(&trace_types_lock); @@ -7529,7 +7529,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - iter->tr->current_trace->ref--; + iter->tr->trace_ref--; __trace_array_put(iter->tr); @@ -8737,7 +8737,7 @@ static int __remove_instance(struct trace_array *tr) int i; /* Reference counter for a newly created trace array = 1. 
*/ - if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13db4000af3f..f21607f87189 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -356,6 +356,7 @@ struct trace_array { struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; + int trace_ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -547,7 +548,6 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; - int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.2.3 From a389d86f7fd0902e4ce4136a5601988dbd371eb1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:25 -0400 Subject: ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. 
As it is possible to detect whether an event is interrupting another event before reserving data, space is added to the length to be reserved so that a full time stamp can be injected along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there are two sub-paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved just to put the full time stamp into the buffer, and that is what is done; as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there are also two sub-paths we care about: a) The event was not interrupted by another event between reserving space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the event directly before this one, and the delta can be computed from the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading the write stamp. Since this event's buffer reservation sits between other events on this path, there is no way to know what the delta is. But because an event interrupted this event after it started, it is fine to just use a zero delta and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution. All of this is per CPU, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after they have successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!)
*/ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 276 +++++++++++++++++++++++++++++---------------- 1 file changed, 181 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00867ff82412..026238c55b0c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include #include +#include static void update_pages_handler(struct work_struct *work); @@ -418,6 +419,17 @@ struct rb_event_info { int add_timestamp; }; +/* + * Used for the add_timestamp + * NONE + * NORMAL - may be for either time extend or absolute + * FORCE - force a full time stamp. + */ +enum { + RB_ADD_STAMP_NONE, + RB_ADD_STAMP_NORMAL, + RB_ADD_STAMP_FORCE +}; /* * Used for which event context the event is in. * NMI = 0 @@ -470,7 +482,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - u64 write_stamp; + local64_t write_stamp; + local64_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -2416,16 +2429,13 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { - bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE || + ring_buffer_time_stamp_abs(cpu_buffer->buffer); event = rb_add_time_stamp(event, abs ? 
info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; @@ -2480,6 +2490,39 @@ static inline bool sched_clock_stable(void) } #endif +static __always_inline bool +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static u64 rb_time_delta(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return 0; + + case RINGBUF_TYPE_TIME_EXTEND: + return ring_buffer_event_time_stamp(event); + + case RINGBUF_TYPE_TIME_STAMP: + return 0; + + case RINGBUF_TYPE_DATA: + return event->time_delta; + default: + return 0; + } +} + static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -2488,6 +2531,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long index; unsigned long addr; + u64 write_stamp; + u64 delta; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); @@ -2496,10 +2541,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = READ_ONCE(cpu_buffer->tail_page); + delta = rb_time_delta(event); + + write_stamp = local64_read(&cpu_buffer->write_stamp); + + /* Make sure the write stamp is read before testing the location */ + barrier(); + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); + u64 ret; + + ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta); + /* Something came in, can't discard */ + if (ret != write_stamp) + return 0; + + /* + * If an event were to come in now, it would see that the + * write_stamp and the before_stamp are different, and assume + * that this event just added itself before updating + * the write stamp. The interrupting event will fix the + * write stamp for us, and use the before stamp as its delta. + */ + /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2551,10 +2618,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - /* Only update the write stamp if the page has an event */ - if (rb_page_write(cpu_buffer->commit_page)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2626,54 +2689,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static __always_inline void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp += delta; - } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp = delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -2872,13 +2891,13 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, + (unsigned long long)local64_read(&cpu_buffer->write_stamp), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); - info->add_timestamp = 1; + info->add_timestamp = RB_ADD_STAMP_NORMAL; } static struct ring_buffer_event * @@ -2887,8 +2906,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, { struct ring_buffer_event *event; struct buffer_page *tail_page; - unsigned long tail, write; + unsigned long tail, write, w; + u64 before, after; + bool abs = false; + + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); + + /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; + barrier(); + before = local64_read(&cpu_buffer->before_stamp); + after = local64_read(&cpu_buffer->write_stamp); + barrier(); + info->ts = rb_time_stamp(cpu_buffer->buffer); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + info->delta = info->ts; + abs = true; + } else { + info->delta = info->ts - after; + } + + if (unlikely(test_time_stamp(info->delta))) + rb_handle_timestamp(cpu_buffer, info); + /* + * If interrupting an event time update, we may need an absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). 
+ */ + if (unlikely(before != after && w)) + info->add_timestamp = RB_ADD_STAMP_FORCE; /* * If the time delta since the last event is too big to * hold in the time field of the event, then we append a @@ -2897,25 +2944,88 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - /* Don't let the compiler play games with cpu_buffer->tail_page */ - tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); - write = local_add_return(info->length, &tail_page->write); + /*B*/ local64_set(&cpu_buffer->before_stamp, info->ts); + + /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; + tail = write - info->length; + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) { + if (tail != w) { + /* before and after may now different, fix it up*/ + before = local64_read(&cpu_buffer->before_stamp); + after = local64_read(&cpu_buffer->write_stamp); + if (before != after) + (void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after); + } + return rb_move_tail(cpu_buffer, tail, info); + } + + if (likely(tail == w)) { + u64 save_before; + + /* Nothing interrupted us between A and C */ + /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); + barrier(); + /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); + if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE)) + /* This did not interrupt any time update */ + info->delta = info->ts - after; + else + /* Just use full timestamp for inerrupting event */ + info->delta = info->ts; + barrier(); + if (unlikely(info->ts != save_before)) { + /* SLOW PATH - Interrupted between C and E */ + + after = local64_read(&cpu_buffer->write_stamp); + /* Write stamp must only go forward */ + if (save_before > after) { + /* + * We do not care about the result, only that + * it gets updated atomically. + */ + (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + } + } + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ + after = local64_read(&cpu_buffer->write_stamp); + ts = rb_time_stamp(cpu_buffer->buffer); + barrier(); + /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + after < ts) { + /* Nothing came after this event between C and E */ + info->delta = ts - after; + (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + info->ts = ts; + } else { + /* + * Interrupted beween C and E: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that + * came after this will still be correct (as they would + * have built their delta on the previous event. + */ + info->delta = 0; + } + if (info->add_timestamp == RB_ADD_STAMP_FORCE) + info->add_timestamp = RB_ADD_STAMP_NORMAL; + } + /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. 
*/ - if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) + if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs)) info->delta = 0; - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, tail, info); - /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); @@ -2944,9 +3054,9 @@ rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); + /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* @@ -2965,7 +3075,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, info.length = rb_calculate_event_length(length); again: - info.add_timestamp = 0; + info.add_timestamp = RB_ADD_STAMP_NONE; info.delta = 0; /* @@ -2980,22 +3090,6 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - if (ring_buffer_time_stamp_abs(buffer)) { - info.delta = info.ts; - rb_handle_timestamp(cpu_buffer, &info); - } else /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { @@ -3004,11 +3098,8 @@ rb_reserve_next_event(struct trace_buffer *buffer, goto again; } - if (!event) - goto out_fail; - - return event; - + if (likely(event)) + return event; out_fail: rb_end_commit(cpu_buffer); return NULL; @@ -3154,11 +3245,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, if (rb_try_to_discard(cpu_buffer, event)) goto out; - /* - * The commit is still visible by the reader, so we - * must still update the timestamp. - */ - rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -4475,8 +4561,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - cpu_buffer->write_stamp = 0; - cpu_buffer->read_stamp = 0; + local64_set(&cpu_buffer->write_stamp, 0); + local64_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; -- cgit v1.2.3 From 7c4b4a5164fbedc11c23e3671bd90ba0d23a5efd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:26 -0400 Subject: ring-buffer: Incorporate absolute timestamp into add_timestamp logic Instead of calling out the absolute test for each time to check if the ring buffer wants absolute time stamps for all its recording, incorporate it with the add_timestamp field and turn it into flags for faster processing between wanting a absolute tag and needing to force one. 
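To illustrate why flags help here, a small self-contained sketch: the enum values and bit assignments are exactly the ones the diff below introduces, while the helper function is only an example and is not part of the patch:

        #include <linux/bits.h>

        /* Same bit assignments as the patch below. */
        enum {
                RB_ADD_STAMP_NONE       = 0,
                RB_ADD_STAMP_EXTEND     = BIT(1),       /* wants a time extend */
                RB_ADD_STAMP_ABSOLUTE   = BIT(2),       /* buffer wants absolute time stamps */
                RB_ADD_STAMP_FORCE      = BIT(3),       /* force a full time stamp */
        };

        /*
         * Example helper (not in the patch): with flags, "forced" and
         * "absolute" are covered by a single mask test instead of comparing
         * against individual enum values plus a separate bool.
         */
        static inline bool rb_needs_abs_timestamp(unsigned int add_timestamp)
        {
                return add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
        }

This masking pattern is what the diff applies in rb_update_event() and __rb_reserve_next().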
Link: http://lkml.kernel.org/r/20200629025259.154892368@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 026238c55b0c..7ee6619951ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -422,13 +422,15 @@ struct rb_event_info { /* * Used for the add_timestamp * NONE - * NORMAL - may be for either time extend or absolute + * EXTEND - wants a time extend + * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { - RB_ADD_STAMP_NONE, - RB_ADD_STAMP_NORMAL, - RB_ADD_STAMP_FORCE + RB_ADD_STAMP_NONE = 0, + RB_ADD_STAMP_EXTEND = BIT(1), + RB_ADD_STAMP_ABSOLUTE = BIT(2), + RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. @@ -2434,8 +2436,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { - bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE || - ring_buffer_time_stamp_abs(cpu_buffer->buffer); + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; @@ -2884,8 +2886,8 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct rb_event_info *info) +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) { WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", @@ -2897,7 +2899,6 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); - info->add_timestamp = RB_ADD_STAMP_NORMAL; } static struct ring_buffer_event * @@ -2908,7 +2909,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page; unsigned long tail, write, w; u64 before, after; - bool abs = false; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); @@ -2922,20 +2922,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { info->delta = info->ts; - abs = true; + info->add_timestamp = RB_ADD_STAMP_ABSOLUTE; } else { info->delta = info->ts - after; } - if (unlikely(test_time_stamp(info->delta))) - rb_handle_timestamp(cpu_buffer, info); + if (unlikely(test_time_stamp(info->delta))) { + rb_check_timestamp(cpu_buffer, info); + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + } /* * If interrupting an event time update, we may need an absolute timestamp. * Don't bother if this is the start of a new page (w == 0). 
*/ if (unlikely(before != after && w)) - info->add_timestamp = RB_ADD_STAMP_FORCE; + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + /* * If the time delta since the last event is too big to * hold in the time field of the event, then we append a @@ -2972,7 +2975,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); barrier(); /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); - if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE)) + if (likely(!(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - after; else @@ -3015,15 +3019,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } - if (info->add_timestamp == RB_ADD_STAMP_FORCE) - info->add_timestamp = RB_ADD_STAMP_NORMAL; + info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs)) + if (unlikely(!tail && !(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ -- cgit v1.2.3 From 01c66c48d4f0825a202d4163800b706a1d2ec7ad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 30 Jun 2020 10:12:40 -0700 Subject: bpf: Fix an incorrect branch elimination by verifier Wenbo reported an issue in [1] where a checking of null pointer is evaluated as always false. In this particular case, the program type is tp_btf and the pointer to compare is a PTR_TO_BTF_ID. The current verifier considers PTR_TO_BTF_ID always reprents a non-null pointer, hence all PTR_TO_BTF_ID compares to 0 will be evaluated as always not-equal, which resulted in the branch elimination. For example, struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { if (arg == 0) test7_result = 1; return 0; } int BPF_PROG(test8, struct bpf_fentry_test_t *arg) { if (arg->a == 0) test8_result = 1; return 0; } In above bpf programs, both branch arg == 0 and arg->a == 0 are removed. This may not be what developer expected. The bug is introduced by Commit cac616db39c2 ("bpf: Verifier track null pointer branch_taken with JNE and JEQ"), where PTR_TO_BTF_ID is considered to be non-null when evaluting pointer vs. scalar comparison. This may be added considering we have PTR_TO_BTF_ID_OR_NULL in the verifier as well. PTR_TO_BTF_ID_OR_NULL is added to explicitly requires a non-NULL testing in selective cases. The current generic pointer tracing framework in verifier always assigns PTR_TO_BTF_ID so users does not need to check NULL pointer at every pointer level like a->b->c->d. We may not want to assign every PTR_TO_BTF_ID as PTR_TO_BTF_ID_OR_NULL as this will require a null test before pointer dereference which may cause inconvenience for developers. But we could avoid branch elimination to preserve original code intention. This patch simply removed PTR_TO_BTD_ID from reg_type_not_null() in verifier, which prevented the above branches from being eliminated. 
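As a rough illustration, the sketch below is loosely modeled on how the verifier decides a "pointer compared against 0" branch; it is not the actual kernel code, and it reuses reg_type_not_null() as defined in the diff that follows:

        /*
         * Simplified sketch of branch evaluation for a pointer vs. 0 compare.
         * Returning -1 means "unknown", so the verifier keeps both branches.
         */
        static int branch_taken_sketch(enum bpf_reg_type type, u8 opcode)
        {
                /* After this fix, PTR_TO_BTF_ID is no longer treated as known non-NULL. */
                if (!reg_type_not_null(type))
                        return -1;      /* keep both arms, as test7/test8 above expect */

                switch (opcode) {
                case BPF_JNE:
                        return 1;       /* "reg != 0" is always taken */
                case BPF_JEQ:
                        return 0;       /* "reg == 0" is never taken */
                default:
                        return -1;
                }
        }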
[1]: https://lore.kernel.org/bpf/79dbb7c0-449d-83eb-5f4f-7af0cc269168@fb.com/T/ Fixes: cac616db39c2 ("bpf: Verifier track null pointer branch_taken with JNE and JEQ") Reported-by: Wenbo Zhang Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630171240.2523722-1-yhs@fb.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8911d0576399..94cead5a43e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -399,8 +399,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_BTF_ID; + type == PTR_TO_SOCK_COMMON; } static bool reg_type_may_be_null(enum bpf_reg_type type) -- cgit v1.2.3 From 10464b4aa605ef93c937452f442e74cc0a4a6ceb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:27 -0400 Subject: ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. 
A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 270 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 243 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7ee6619951ea..802bb38d9c81 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,7 +27,6 @@ #include #include -#include static void update_pages_handler(struct work_struct *work); @@ -449,6 +448,28 @@ enum { RB_CTX_MAX }; +#if BITS_PER_LONG == 32 +#define RB_TIME_32 +#endif + +/* To test on 64 bit machines */ +//#define RB_TIME_32 + +#ifdef RB_TIME_32 + +struct rb_time_struct { + local_t cnt; + local_t top; + local_t bottom; +}; +#else +#include +struct rb_time_struct { + local64_t time; +}; +#endif +typedef struct rb_time_struct rb_time_t; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -484,8 +505,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - local64_t write_stamp; - local64_t before_stamp; + rb_time_t write_stamp; + rb_time_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -528,6 +549,189 @@ struct ring_buffer_iter { int missed_events; }; +#ifdef RB_TIME_32 + +/* + * On 32 bit machines, local64_t is very expensive. As the ring + * buffer doesn't need all the features of a true 64 bit atomic, + * on 32 bit, it uses these functions (64 still uses local64_t). + * + * For the ring buffer, 64 bit required operations for the time is + * the following: + * + * - Only need 59 bits (uses 60 to make it even). + * - Reads may fail if it interrupted a modification of the time stamp. + * It will succeed if it did not interrupt another write even if + * the read itself is interrupted by a write. + * It returns whether it was successful or not. + * + * - Writes always succeed and will overwrite other writes and writes + * that were done by events interrupting the current write. + * + * - A write followed by a read of the same time stamp will always succeed, + * but may not contain the same value. + * + * - A cmpxchg will fail if it interrupted another write or cmpxchg. + * Other than that, it acts like a normal cmpxchg. + * + * The 60 bit time stamp is broken up by 30 bits in a top and bottom half + * (bottom being the least significant 30 bits of the 60 bit time stamp). + * + * The two most significant bits of each half holds a 2 bit counter (0-3). + * Each update will increment this counter by one. + * When reading the top and bottom, if the two counter bits match then the + * top and bottom together make a valid 60 bit number. 
+ */ +#define RB_TIME_SHIFT 30 +#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) + +static inline int rb_time_cnt(unsigned long val) +{ + return (val >> RB_TIME_SHIFT) & 3; +} + +static inline u64 rb_time_val(unsigned long top, unsigned long bottom) +{ + u64 val; + + val = top & RB_TIME_VAL_MASK; + val <<= RB_TIME_SHIFT; + val |= bottom & RB_TIME_VAL_MASK; + + return val; +} + +static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) +{ + unsigned long top, bottom; + unsigned long c; + + /* + * If the read is interrupted by a write, then the cnt will + * be different. Loop until both top and bottom have been read + * without interruption. + */ + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(bottom)) + return false; + + *ret = rb_time_val(top, bottom); + return true; +} + +static bool rb_time_read(rb_time_t *t, u64 *ret) +{ + unsigned long cnt; + + return __rb_time_read(t, ret, &cnt); +} + +static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +{ + return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); +} + +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +{ + *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); + *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); +} + +static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) +{ + val = rb_time_val_cnt(val, cnt); + local_set(t, val); +} + +static void rb_time_set(rb_time_t *t, u64 val) +{ + unsigned long cnt, top, bottom; + + rb_time_split(val, &top, &bottom); + + /* Writes always succeed with a valid number even if it gets interrupted. 
*/ + do { + cnt = local_inc_return(&t->cnt); + rb_time_val_set(&t->top, top, cnt); + rb_time_val_set(&t->bottom, bottom, cnt); + } while (cnt != local_read(&t->cnt)); +} + +static inline bool +rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) +{ + unsigned long ret; + + ret = local_cmpxchg(l, expect, set); + return ret == expect; +} + +static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + unsigned long cnt, top, bottom; + unsigned long cnt2, top2, bottom2; + u64 val; + + /* The cmpxchg always fails if it interrupted an update */ + if (!__rb_time_read(t, &val, &cnt2)) + return false; + + if (val != expect) + return false; + + cnt = local_read(&t->cnt); + if ((cnt & 3) != cnt2) + return false; + + cnt2 = cnt + 1; + + rb_time_split(val, &top, &bottom); + top = rb_time_val_cnt(top, cnt); + bottom = rb_time_val_cnt(bottom, cnt); + + rb_time_split(set, &top2, &bottom2); + top2 = rb_time_val_cnt(top2, cnt2); + bottom2 = rb_time_val_cnt(bottom2, cnt2); + + if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) + return false; + return true; +} + +#else /* 64 bits */ + +/* local64_t always succeeds */ + +static inline bool rb_time_read(rb_time_t *t, u64 *ret) +{ + *ret = local64_read(&t->time); + return true; +} +static void rb_time_set(rb_time_t *t, u64 val) +{ + local64_set(&t->time, val); +} + +static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + u64 val; + val = local64_cmpxchg(&t->time, expect, set); + return val == expect; +} +#endif + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -2545,7 +2749,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, delta = rb_time_delta(event); - write_stamp = local64_read(&cpu_buffer->write_stamp); + if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) + return 0; /* Make sure the write stamp is read before testing the location */ barrier(); @@ -2554,11 +2759,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); - u64 ret; - ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta); /* Something came in, can't discard */ - if (ret != write_stamp) + if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, + write_stamp, write_stamp - delta)) return 0; /* @@ -2889,11 +3093,13 @@ static noinline void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { + u64 write_stamp; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, - (unsigned long long)local64_read(&cpu_buffer->write_stamp), + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), sched_clock_stable() ? 
"" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2909,14 +3115,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page; unsigned long tail, write, w; u64 before, after; + bool a_ok; + bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -2927,16 +3135,18 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->delta = info->ts - after; } - if (unlikely(test_time_stamp(info->delta))) { - rb_check_timestamp(cpu_buffer, info); - info->add_timestamp |= RB_ADD_STAMP_EXTEND; + if (likely(a_ok && b_ok)) { + if (unlikely(test_time_stamp(info->delta))) { + rb_check_timestamp(cpu_buffer, info); + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + } } /* * If interrupting an event time update, we may need an absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ - if (unlikely(before != after && w)) + if (unlikely(!a_ok || !b_ok || (before != after && w))) info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; /* @@ -2947,7 +3157,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - /*B*/ local64_set(&cpu_buffer->before_stamp, info->ts); + /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); @@ -2960,21 +3170,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(write > BUF_PAGE_SIZE)) { if (tail != w) { /* before and after may now different, fix it up*/ - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); - if (before != after) - (void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + if (a_ok && b_ok && before != after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after); } return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { u64 save_before; + bool s_ok; /* Nothing interrupted us between A and C */ - /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); + /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); barrier(); - /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); + /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); + RB_WARN_ON(cpu_buffer, !s_ok); if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -2986,27 +3198,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + RB_WARN_ON(cpu_buffer, !a_ok); + /* Write stamp must only go forward */ if (save_before > after) { /* * We do not care about the result, only that * it gets updated atomically. 
*/ - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before); } } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && after < ts) { /* Nothing came after this event between C and E */ info->delta = ts - after; - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); info->ts = ts; } else { /* @@ -4565,8 +4781,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - local64_set(&cpu_buffer->write_stamp, 0); - local64_set(&cpu_buffer->before_stamp, 0); + rb_time_set(&cpu_buffer->write_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; -- cgit v1.2.3 From b23d7a5f4a07af02343cdd28fe1f7488bac3afda Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 25 Jun 2020 15:34:03 +1000 Subject: ring-buffer: speed up buffer resets by avoiding synchronize_rcu for each CPU On a 144 thread system, `perf ftrace` takes about 20 seconds to start up, due to calling synchronize_rcu() for each CPU. cat /proc/108560/stack 0xc0003e7eb336f470 __switch_to+0x2e0/0x480 __wait_rcu_gp+0x20c/0x220 synchronize_rcu+0x9c/0xc0 ring_buffer_reset_cpu+0x88/0x2e0 tracing_reset_online_cpus+0x84/0xe0 tracing_open+0x1d4/0x1f0 On a system with 10x more threads, it starts to become an annoyance. Batch these up so we disable all the per-cpu buffers first, then synchronize_rcu() once, then reset each of the buffers. This brings the time down to about 0.5s. Link: https://lkml.kernel.org/r/20200625053403.2386972-1-npiggin@gmail.com Tested-by: Anton Blanchard Acked-by: Paul E. 
McKenney Signed-off-by: Nicholas Piggin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 85 ++++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.c | 4 +-- 2 files changed, 72 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 802bb38d9c81..ed1941304f69 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) +#define for_each_online_buffer_cpu(buffer, cpu) \ + for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) + #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) @@ -4790,6 +4793,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_head_page_activate(cpu_buffer); } +/* Must have disabled the cpu buffer then done a synchronize_rcu */ +static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + arch_spin_unlock(&cpu_buffer->lock); + + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} + /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of @@ -4798,7 +4821,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -4809,24 +4831,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) /* Make sure all commits have finished */ synchronize_rcu(); - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + reset_disabled_cpu_buffer(cpu_buffer); - if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); - arch_spin_lock(&cpu_buffer->lock); +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; - rb_reset_cpu(cpu_buffer); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - arch_spin_unlock(&cpu_buffer->lock); + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + /* Make sure all commits have finished */ + synchronize_rcu(); - atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->resize_disabled); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } -EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset - reset a ring buffer @@ -4834,10 +4874,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct trace_buffer *buffer) { + struct ring_buffer_per_cpu *cpu_buffer; int cpu; - 
for_each_buffer_cpu(buffer, cpu) - ring_buffer_reset_cpu(buffer, cpu); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } + + /* Make sure all commits have finished */ + synchronize_rcu(); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } EXPORT_SYMBOL_GPL(ring_buffer_reset); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64c5b8146cca..4aab712f9567 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2003,7 +2003,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu) void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; - int cpu; if (!buffer) return; @@ -2015,8 +2014,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf) buf->time_start = buffer_ftrace_now(buf, buf->cpu); - for_each_online_cpu(cpu) - ring_buffer_reset_cpu(buffer, cpu); + ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } -- cgit v1.2.3 From 75b21c6dfa2d816bcabac24a530d9cffc12092b8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 09:04:35 -0400 Subject: ring-buffer: Mark the !tail (crossing a page) as unlikely It is the uncommon case where an event crosses a sub buffer boundary (page) mark that check at the end of reserving an event as unlikely. Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ed1941304f69..888bc9177937 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3260,7 +3260,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then update * its timestamp. */ - if (!tail) + if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ -- cgit v1.2.3 From fb37409a01b011a664347702f44dbf13fa7c7486 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:45:20 +0300 Subject: arch: remove unicore32 port The unicore32 port do not seem maintained for a long time now, there is no upstream toolchain that can create unicore32 binaries and all the links to prebuilt toolchains for unicore32 are dead. Even compilers that were available are not supported by the kernel anymore. Guenter Roeck says: I have stopped building unicore32 images since v4.19 since there is no available compiler that is still supported by the kernel. I am surprised that support for it has not been removed from the kernel. Remove unicore32 port. 
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- kernel/reboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 491f1347bf43..e7b78d5ae1ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -26,7 +26,7 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); -#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#if defined(CONFIG_ARM) #define DEFAULT_REBOOT_MODE = REBOOT_HARD #else #define DEFAULT_REBOOT_MODE -- cgit v1.2.3 From bba1dc0b55ac462d24ed1228ad49800c238cd6d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jun 2020 21:33:39 -0700 Subject: bpf: Remove redundant synchronize_rcu. bpf_free_used_maps() or close(map_fd) will trigger map_free callback. bpf_free_used_maps() is called after bpf prog is no longer executing: bpf_prog_put->call_rcu->bpf_prog_free->bpf_free_used_maps. Hence there is no need to call synchronize_rcu() to protect map elements. Note that hash_of_maps and array_of_maps update/delete inner maps via sys_bpf() that calls maybe_wait_bpf_programs() and synchronize_rcu(). Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200630043343.53195-2-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 9 --------- kernel/bpf/hashtab.c | 8 +++----- kernel/bpf/lpm_trie.c | 5 ----- kernel/bpf/queue_stack_maps.c | 7 ------- kernel/bpf/reuseport_array.c | 2 -- kernel/bpf/ringbuf.c | 7 ------- kernel/bpf/stackmap.c | 3 --- 7 files changed, 3 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ec5cd11032aa..c66e8273fccd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding programs to complete - * and free the array - */ - synchronize_rcu(); - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -546,8 +539,6 @@ static void fd_array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - synchronize_rcu(); - /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index acd06081d81d..d4378d7d442b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1290,12 +1290,10 @@ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete + /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. + * bpf_free_used_maps() is called after bpf prog is no longer executing. + * There is no need to synchronize_rcu() here to protect map elements. */ - synchronize_rcu(); /* some of free_htab_elem() callbacks for elements of this map may * not have executed. Wait for them. 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1abd4e3f906d..44474bf3ab7a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - /* Wait for outstanding programs to complete - * update/lookup/delete/get_next_key and free the trie. - */ - synchronize_rcu(); - /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 80c66a6d7c54..44184f82916a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - bpf_map_area_free(qs); } diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a09922f656e4..3625c4fcc65c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -96,8 +96,6 @@ static void reuseport_array_free(struct bpf_map *map) struct sock *sk; u32 i; - synchronize_rcu(); - /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index dbf37aff4827..13a8d3967e07 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -215,13 +215,6 @@ static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); kfree(rb_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 27dc9b1b08a5..071f98d0f7c6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -604,9 +604,6 @@ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - /* wait for bpf programs to complete before freeing stack map */ - synchronize_rcu(); - bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); -- cgit v1.2.3 From d141b8bc5773cbbaf5b8530f08f94fc10fff9e8c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:43 -0700 Subject: perf: Expose get/put_callchain_entry() Sanitize and expose get/put_callchain_entry(). This would be used by bpf stack map. 
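As a rough illustration only, a hypothetical caller of the now-exported pair (mirroring how the following bpf patch uses it) would bracket the fill with get/put; note that a NULL return means the recursion context has already been released, so no put is needed in that case:

  /* Hypothetical caller sketch -- not part of this patch. */
  #include <linux/perf_event.h>

  static struct perf_callchain_entry *grab_entry(void)
  {
  	struct perf_callchain_entry *entry;
  	int rctx;

  	entry = get_callchain_entry(&rctx);
  	if (!entry)			/* recursion context already dropped */
  		return NULL;

  	entry->nr = 0;			/* caller fills entry->ip[] here */

  	put_callchain_entry(rctx);	/* release the recursion context */
  	return entry;
  }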
Suggested-by: Peter Zijlstra Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-2-songliubraving@fb.com --- kernel/events/callchain.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 334d48b16c36..c6ce894e4ce9 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -149,7 +149,7 @@ void put_callchain_buffers(void) } } -static struct perf_callchain_entry *get_callchain_entry(int *rctx) +struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) return NULL; entries = rcu_dereference(callchain_cpus_entries); - if (!entries) + if (!entries) { + put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; + } cpu = smp_processor_id(); @@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) (*rctx * perf_callchain_entry__sizeof())); } -static void +void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); @@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, int rctx; entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - if (!entry) - goto exit_put; + return NULL; ctx.entry = entry; ctx.max_stack = max_stack; -- cgit v1.2.3 From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps the stack trace of a given task. This is different from bpf_get_stack(), which gets the stack trace of the current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc//stack to a seq_file. bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for the kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to an unsigned long array. For 32-bit systems, we need to translate it to a u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ 3 files changed, 78 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. 
+ */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return 
&bpf_get_task_stack_proto; default: return NULL; } -- cgit v1.2.3 From 2df6bb5493f83c7e818f66c384ea337c1b3da228 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:45 -0700 Subject: bpf: Allow %pB in bpf_seq_printf() and bpf_trace_printk() This makes it easy to dump stack trace in text. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-4-songliubraving@fb.com --- kernel/trace/bpf_trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 977ba3b6f6c6..1d874d8e4384 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -376,7 +376,7 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -420,6 +420,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, goto fmt_str; } + if (fmt[i + 1] == 'B') { + i++; + goto fmt_next; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && @@ -636,7 +641,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, if (fmt[i] == 'p') { if (fmt[i + 1] == 0 || fmt[i + 1] == 'K' || - fmt[i + 1] == 'x') { + fmt[i + 1] == 'x' || + fmt[i + 1] == 'B') { /* just kernel pointers */ params[fmt_cnt] = args[fmt_cnt]; fmt_cnt++; -- cgit v1.2.3 From 58fbc3c63275c6255a10ae151cc3882d9190c4e0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 08:59:26 -0400 Subject: ring-buffer: Consolidate add_timestamp to remove some branches Reorganize a little the logic to handle adding the absolute time stamp, extended and forced time stamps, in such a way to remove a branch or two. This is just a micro optimization. Also add before and after time stamps to the rb_event_info structure to display those values in the rb_check_timestamps() code, if something were to go wrong. Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 139 ++++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 888bc9177937..ce125cbe98a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -416,6 +416,8 @@ struct rb_irq_work { struct rb_event_info { u64 ts; u64 delta; + u64 before; + u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; @@ -2619,6 +2621,33 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + u64 write_stamp; + + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! 
%llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)info->before, + (unsigned long long)info->after, + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); +} + /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event @@ -2646,6 +2675,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + rb_check_timestamp(cpu_buffer, info); event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2692,13 +2722,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -3092,24 +3115,6 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); -static noinline void -rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct rb_event_info *info) -{ - u64 write_stamp; - - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), - sched_clock_stable() ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" - "or add trace_clock=global to the kernel command line\n"); -} - static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -3117,7 +3122,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - u64 before, after; bool a_ok; bool b_ok; @@ -3126,40 +3130,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); - if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; - info->add_timestamp = RB_ADD_STAMP_ABSOLUTE; } else { - info->delta = info->ts - after; - } - - if (likely(a_ok && b_ok)) { - if (unlikely(test_time_stamp(info->delta))) { - rb_check_timestamp(cpu_buffer, info); - info->add_timestamp |= RB_ADD_STAMP_EXTEND; + /* + * If interrupting an event time update, we may need an + * absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). 
+ */ + if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } else { + info->delta = info->ts - info->after; + if (unlikely(test_time_stamp(info->delta))) { + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } } } - /* - * If interrupting an event time update, we may need an absolute timestamp. - * Don't bother if this is the start of a new page (w == 0). - */ - if (unlikely(!a_ok || !b_ok || (before != after && w))) - info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; - /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); @@ -3173,10 +3168,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(write > BUF_PAGE_SIZE)) { if (tail != w) { /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); - if (a_ok && b_ok && before != after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); } return rb_move_tail(cpu_buffer, tail, info); } @@ -3193,7 +3189,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ - info->delta = info->ts - after; + info->delta = info->ts - info->after; else /* Just use full timestamp for inerrupting event */ info->delta = info->ts; @@ -3201,31 +3197,33 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); RB_WARN_ON(cpu_buffer, !a_ok); /* Write stamp must only go forward */ - if (save_before > after) { + if (save_before > info->after) { /* * We do not care about the result, only that * it gets updated atomically. 
*/ - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, save_before); } } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); /* Was interrupted before here, write_stamp must be valid */ RB_WARN_ON(cpu_buffer, !a_ok); ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - after < ts) { + info->after < ts) { /* Nothing came after this event between C and E */ - info->delta = ts - after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + info->delta = ts - info->after; + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, info->ts); info->ts = ts; } else { /* @@ -3277,6 +3275,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; + int add_ts_default; rb_start_commit(cpu_buffer); /* The commit page can not change after this */ @@ -3297,8 +3296,16 @@ rb_reserve_next_event(struct trace_buffer *buffer, #endif info.length = rb_calculate_event_length(length); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + add_ts_default = RB_ADD_STAMP_ABSOLUTE; + info.length += RB_LEN_TIME_EXTEND; + } else { + add_ts_default = RB_ADD_STAMP_NONE; + } + again: - info.add_timestamp = RB_ADD_STAMP_NONE; + info.add_timestamp = add_ts_default; info.delta = 0; /* @@ -3316,7 +3323,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { - if (info.add_timestamp) + if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } -- cgit v1.2.3 From 74e879373b377f15d4ecb45bf8316b77e8badc49 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 12:47:56 -0400 Subject: ring-buffer: Move the add_timestamp into its own function Make a helper function rb_add_timestamp() that moves the adding of the extended time stamps into its own function. Also, remove the noinline and inline for the functions it calls, as recent benchmarks appear they do not make a difference (just let gcc decide). 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce125cbe98a5..a30ca7ec2200 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2596,8 +2596,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * +/* Slow path */ +static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) @@ -2628,7 +2628,7 @@ static inline bool sched_clock_stable(void) } #endif -static noinline void +static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { @@ -2648,6 +2648,21 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "or add trace_clock=global to the kernel command line\n"); } +static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event **event, + struct rb_event_info *info, + u64 *delta, + unsigned int *length) +{ + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + + rb_check_timestamp(cpu_buffer, info); + *event = rb_add_time_stamp(*event, info->delta, abs); + *length -= RB_LEN_TIME_EXTEND; + *delta = 0; +} + /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event @@ -2671,15 +2686,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the reserved space. */ - if (unlikely(info->add_timestamp)) { - bool abs = info->add_timestamp & - (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); - - rb_check_timestamp(cpu_buffer, info); - event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } + if (unlikely(info->add_timestamp)) + rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; -- cgit v1.2.3 From bbeba3e58f040a4297a5ba88ebf6e2b16adc3657 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 13:05:29 -0400 Subject: ring-buffer: Call trace_clock_local() directly for RETPOLINE kernels After doing some benchmarks and examining the code, I found that the ring buffer clock calls were quite expensive, and noticed that it uses retpolines. This is because the ring buffer clock is programmable, and can be set. But in most cases it simply uses the fastest ns unit clock which is the trace_clock_local(). For RETPOLINE builds, checking if the ring buffer clock is set to trace_clock_local() and then calling it directly has brought the time of an event on my i7 box from an average of 93 nanoseconds an event down to 83 nanoseconds an event, and the minimum time from 81 nanoseconds to 68 nanoseconds! 
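The underlying pattern is simply a pointer comparison guarding a direct call, so the compiler can emit a plain call instead of a retpoline thunk; a minimal stand-alone sketch of the idea (hypothetical names, not the ring-buffer code itself):

  /* Sketch of the compare-then-call-directly pattern. */
  typedef unsigned long long u64;

  static u64 default_clock(void)
  {
  	return 0;	/* placeholder time source */
  }

  struct clocked {
  	u64 (*clock)(void);	/* normally an indirect (retpolined) call */
  };

  static u64 read_clock(struct clocked *c)
  {
  	/* Fast path: the common default gets a direct call. */
  	if (c->clock == default_clock)
  		return default_clock();
  	/* Slow path: genuinely indirect call for overridden clocks. */
  	return c->clock();
  }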
Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a30ca7ec2200..2bb96ee80120 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -970,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, static inline u64 rb_time_stamp(struct trace_buffer *buffer) { + u64 ts; + + /* Skip retpolines :-( */ + if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + ts = trace_clock_local(); + else + ts = buffer->clock(); + /* shift to debug/test normalization and TIME_EXTENTS */ - return buffer->clock() << DEBUG_SHIFT; + return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) -- cgit v1.2.3 From 29ce24519c0692ca7d998d7444a9e016a4c44fa7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Jul 2020 13:10:19 -0400 Subject: ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp) is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. On the machine where it triggered, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this: the delta that is used is larger than the max, so the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamps are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backwards. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2bb96ee80120..c3a2e7509527 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2642,8 +2642,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 write_stamp; - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! 
%llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, @@ -2665,7 +2664,26 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); - rb_check_timestamp(cpu_buffer, info); + if (unlikely(info->delta > (1ULL << 59))) { + /* did the clock go backwards */ + if (info->before == info->after && info->before > info->ts) { + /* not interrupted */ + static int once; + + /* + * This is possible with a recalibrating of the TSC. + * Do not produce a call stack, but just report it. + */ + if (!once) { + once++; + pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", + info->before, info->ts); + } + } else + rb_check_timestamp(cpu_buffer, info); + if (!abs) + info->delta = 0; + } *event = rb_add_time_stamp(*event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; -- cgit v1.2.3 From 10dd8573b09e84b81539d939d55ebdb6a36c5f3a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 29 Jun 2020 13:54:59 +0530 Subject: cpufreq: Register governors at core_initcall Currently, most CPUFreq governors are registered at the core_initcall time when the given governor is the default one, and the module_init time otherwise. In preparation for letting users specify the default governor on the kernel command line, change all of them to be registered at the core_initcall unconditionally, as it is already the case for the schedutil and performance governors. This will allow us to assume that builtin governors have been registered before the built-in CPUFreq drivers probe. And since all governors have similar init/exit patterns now, introduce two new macros, cpufreq_governor_{init,exit}(), to factorize the code. Acked-by: Viresh Kumar Signed-off-by: Quentin Perret Signed-off-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..402a09af9f43 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; -- cgit v1.2.3 From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:36 +0530 Subject: crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo Right now user-space tools like 'makedumpfile' and 'crash' need to rely on a best-guess method of determining value of 'MAX_PHYSMEM_BITS' supported by underlying kernel. This value is used in user-space code to calculate the bit-space required to store a section for SPARESMEM (similar to the existing calculation method used in the kernel implementation): #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) Now, regressions have been reported in user-space utilities like 'makedumpfile' and 'crash' on arm64, with the recently added kernel support for 52-bit physical address space, as there is no clear method of determining this value in user-space (other than reading kernel CONFIG flags). 
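As a purely illustrative sketch (hypothetical user-space code, not makedumpfile itself), exporting MAX_PHYSMEM_BITS turns this into a direct computation instead of a guess; SECTION_SIZE_BITS below is only an example value, as it remains arch-specific:

  /* Illustration: derive SECTIONS_SHIFT from a value read out of vmcoreinfo. */
  #include <stdio.h>

  #define SECTION_SIZE_BITS 30	/* example only; depends on the architecture */

  int main(void)
  {
  	unsigned long max_physmem_bits = 52;	/* as parsed from NUMBER(MAX_PHYSMEM_BITS) */
  	unsigned long sections_shift = max_physmem_bits - SECTION_SIZE_BITS;

  	printf("SECTIONS_SHIFT = %lu\n", sections_shift);
  	return 0;
  }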
As per suggestion from makedumpfile maintainer (Kazu), it makes more sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than in arch-specific code, so that the user-space code for other archs can also benefit from this addition to the vmcoreinfo and use it as a standard way of determining 'SECTIONS_SHIFT' value in user-land. A reference 'makedumpfile' implementation which reads the 'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion is available here: While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS' variable being added to vmcoreinfo. 'MAX_PHYSMEM_BITS' defines the maximum supported physical address space memory. Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Acked-by: Dave Young Cc: Boris Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Anderson Cc: Kazuhito Hagio Cc: x86@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); -- cgit v1.2.3 From 046cc3dd9a2507b8e111714807ab8bf15ea5bb70 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 2 Jul 2020 19:45:37 -0700 Subject: bpf: Fix build without CONFIG_STACKTRACE Without CONFIG_STACKTRACE stack_trace_save_tsk() is not defined. Let get_callchain_entry_for_task() to always return NULL in such cases. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200703024537.79971-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5ad72ab2276b..a6c361ed7937 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -351,6 +351,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) { +#ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; @@ -380,6 +381,9 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) put_callchain_entry(rctx); return entry; +#else /* CONFIG_STACKTRACE */ + return NULL; +#endif } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, -- cgit v1.2.3 From a3a66c3822e03692ed7c5888e8f2d384cc698d34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jul 2020 15:15:27 -0700 Subject: vmalloc: fix the owner argument for the new __vmalloc_node_range callers Fix the recently added new __vmalloc_node_range callers to pass the correct values as the owner for display in /proc/vmallocinfo. 
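A reduced sketch of the idea (illustrative only, hypothetical names): record the caller's return address rather than a local function name, so the owner shown in diagnostics identifies the real call site:

  /* Sketch: remember who requested an allocation for later display. */
  struct alloc_record {
  	const void *owner;	/* e.g. printed with %pS in /proc/vmallocinfo */
  };

  static void record_alloc(struct alloc_record *rec, const void *owner)
  {
  	rec->owner = owner;
  }

  static void my_module_alloc(struct alloc_record *rec)
  {
  	/*
  	 * __builtin_return_address(0) resolves to the function that called
  	 * my_module_alloc(), which is more useful than __func__ (that would
  	 * always name my_module_alloc itself).
  	 */
  	record_alloc(rec, __builtin_return_address(0));
  }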
Fixes: 800e26b81311 ("x86/hyperv: allocate the hypercall page with only read and execute bits") Fixes: 10d5e97c1bf8 ("arm64: use PAGE_KERNEL_ROX directly in alloc_insn_page") Fixes: 7a0e27b2a0ce ("mm: remove vmalloc_exec") Reported-by: Ard Biesheuvel Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200627075649.2455097-1-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0c6573b98c36..bee1c25ca5c5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2785,7 +2785,7 @@ void * __weak module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __func__); + NUMA_NO_NODE, __builtin_return_address(0)); } bool __weak module_init_section(const char *name) -- cgit v1.2.3 From 8fa88a88d573093868565a1afba43b5ae5b3a316 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 3 Jul 2020 16:56:45 +0100 Subject: genirq: Remove preflow handler support That was put in place for sparc64, and blackfin also used it for some time; sparc64 no longer uses those, and blackfin is dead. As there are no more users, remove preflow handlers. Signed-off-by: Valentin Schneider Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200703155645.29703-3-valentin.schneider@arm.com --- kernel/irq/Kconfig | 4 ---- kernel/irq/chip.c | 13 ------------- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20512252ecc9..10a5aff4eecc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Preflow handler support for fasteoi (sparc64) -config IRQ_PREFLOW_FASTEOI - bool - # Edge style eoi based handler (cell) config IRQ_EDGE_EOI_HANDLER bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..75bbaa8b38f1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -656,16 +656,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void preflow_handler(struct irq_desc *desc) -{ - if (desc->preflow_handler) - desc->preflow_handler(&desc->irq_data); -} -#else -static inline void preflow_handler(struct irq_desc *desc) { } -#endif - static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { @@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); -- cgit v1.2.3 From 5fec25f2cb959cb5f189d7f6127bee3efc782530 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 24 Jun 2020 16:34:57 -0500 Subject: umh: Capture the pid in umh_pipe_setup The pid in struct subprocess_info is only used by umh_clean_and_save_pid to write the pid into umh_info. 
Instead always capture the pid on struct umh_info in umh_pipe_setup, removing code that is specific to user mode drivers from the common user path of user mode helpers. v1: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/875zb97iix.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-1-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 79f139a7ca03..c2a582b3a2bf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - sub_info->pid = task_pid_nr(current); if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); @@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) umh_info->pipe_to_umh = to_umh[1]; umh_info->pipe_from_umh = from_umh[0]; + umh_info->pid = task_pid_nr(current); return 0; } @@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info) struct umh_info *umh_info = info->data; /* cleanup if umh_pipe_setup() was successful but exec failed */ - if (info->pid && info->retval) { + if (info->retval) { fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); } argv_free(info->argv); - umh_info->pid = info->pid; } /** -- cgit v1.2.3 From b044fa2ae50d52d8c9f9d130055c2aea032e7475 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 10:04:25 -0500 Subject: umh: Move setting PF_UMH into umh_pipe_setup I am separating the code specific to user mode drivers from the code for ordinary user space helpers. Move setting of PF_UMH from call_usermodehelper_exec_async which is core user mode helper code into umh_pipe_setup which is user mode driver code. The code is equally as easy to write in one location as the other and the movement minimizes the impact of the user mode driver code on the core of the user mode helper code. Setting PF_UMH unconditionally is harmless as an action will only happen if it is paired with an entry on umh_list. v1: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87zh8l63xs.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-2-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/umh.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index c2a582b3a2bf..e6b9d6636850 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - if (sub_info->file) { + if (sub_info->file) retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - if (!retval) - current->flags |= PF_UMH; - } else + else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) umh_info->pipe_to_umh = to_umh[1]; umh_info->pipe_from_umh = from_umh[0]; umh_info->pid = task_pid_nr(current); + current->flags |= PF_UMH; return 0; } -- cgit v1.2.3 From 3a171042aeab8702391c311d84633a6c267d566c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 10:10:03 -0500 Subject: umh: Rename the user mode driver helpers for clarity Now that the functionality of umh_setup_pipe and umh_clean_and_save_pid has changed their names are too specific and don't make much sense. Instead name them umd_setup and umd_cleanup for the functional role in setting up user mode drivers. v1: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87tuyt63x3.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-3-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index e6b9d6636850..26c3d493f168 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, return sub_info; } -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umh_info *umh_info = info->data; struct file *from_umh[2]; @@ -470,11 +470,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return 0; } -static void umh_clean_and_save_pid(struct subprocess_info *info) +static void umd_cleanup(struct subprocess_info *info) { struct umh_info *umh_info = info->data; - /* cleanup if umh_pipe_setup() was successful but exec failed */ + /* cleanup if umh_setup() was successful but exec failed */ if (info->retval) { fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); @@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) } err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_clean_and_save_pid, info); + sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup, + info); if (!sub_info) goto out; -- cgit v1.2.3 From 21d598280675c463ea1b264fab06e9614aacd1e1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 24 Jun 2020 17:01:18 -0500 Subject: umh: Remove call_usermodehelper_setup_file. The only caller of call_usermodehelper_setup_file is fork_usermode_blob. In fork_usermode_blob replace call_usermodehelper_setup_file with call_usermodehelper_setup and delete fork_usermodehelper_setup_file. For this to work the argv_free is moved from umh_clean_and_save_pid to fork_usermode_blob. 
v1: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87o8p163u1.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-4-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 26c3d493f168..b8fa9b99b366 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -struct subprocess_info *call_usermodehelper_setup_file(struct file *file, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), void *data) -{ - struct subprocess_info *sub_info; - struct umh_info *info = data; - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); - if (!sub_info) - return NULL; - - sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!sub_info->argv) { - kfree(sub_info); - return NULL; - } - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - sub_info->path = "none"; - sub_info->file = file; - sub_info->init = init; - sub_info->cleanup = cleanup; - sub_info->data = data; - return sub_info; -} - static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umh_info *umh_info = info->data; @@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info) fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); } - - argv_free(info->argv); } /** @@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info) */ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) { + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; + char **argv = NULL; struct file *file; ssize_t written; loff_t pos = 0; @@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) } err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup, - info); + argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!argv) + goto out; + + sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); if (!sub_info) goto out; + sub_info->file = file; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); if (!err) { mutex_lock(&umh_list_lock); @@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) mutex_unlock(&umh_list_lock); } out: + if (argv) + argv_free(argv); fput(file); return err; } -- cgit v1.2.3 From 884c5e683b67dbc52892e24c29eed864f330ec08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 26 Jun 2020 12:23:00 -0500 Subject: umh: Separate the user mode driver and the user mode helper support This makes it clear which code is part of the core user mode helper support and which code is needed to implement user mode drivers. This makes the kernel smaller for everyone who does not use a usermode driver. 
v1: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87imf963s6.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-5-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/Makefile | 1 + kernel/exit.c | 1 + kernel/umh.c | 139 -------------------------------------------- kernel/usermode_driver.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 148 insertions(+), 139 deletions(-) create mode 100644 kernel/usermode_driver.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..43928759893a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_BPFILTER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..a081deea52ca 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/umh.c b/kernel/umh.c index b8fa9b99b366..3e4e453d45c8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include @@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umh_info *umh_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - umh_info->pid = task_pid_nr(current); - current->flags |= PF_UMH; - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umh_info *umh_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); - } -} - -/** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) - * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. 
In such - * case 'struct umh_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. - */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) -{ - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - struct subprocess_info *sub_info; - char **argv = NULL; - struct file *file; - ssize_t written; - loff_t pos = 0; - int err; - - file = shmem_kernel_file_setup("", len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - - err = -ENOMEM; - argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!argv) - goto out; - - sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - sub_info->file = file; - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } -out: - if (argv) - argv_free(argv); - fput(file); - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_blob); - /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa @@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -void __exit_umh(struct task_struct *tsk) -{ - struct umh_info *info; - pid_t pid = tsk->pid; - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} - struct ctl_table usermodehelper_table[] = { { .procname = "bset", diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c new file mode 100644 index 000000000000..5b05863af855 --- /dev/null +++ b/kernel/usermode_driver.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * umd - User mode driver support + */ +#include +#include +#include + +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); + +static int umd_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + umh_info->pid = task_pid_nr(current); + current->flags |= PF_UMH; + return 0; +} + +static void umd_cleanup(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + /* cleanup if umh_setup() was successful but exec failed */ + if (info->retval) { + fput(umh_info->pipe_to_umh); + fput(umh_info->pipe_from_umh); + } +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob 
of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; + struct subprocess_info *sub_info; + char **argv = NULL; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!argv) + goto out; + + sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); + if (!sub_info) + goto out; + + sub_info->file = file; + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } +out: + if (argv) + argv_free(argv); + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + -- cgit v1.2.3 From 74be2d3b80af1bb264c3b9905b52c15efc03c0fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 26 Jun 2020 11:16:06 -0500 Subject: umd: For clarity rename umh_info umd_info This structure is only used for user mode drivers so change the prefix from umh to umd to make that clear. v1: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/878sg563po.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-6-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/usermode_driver.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 5b05863af855..e73550e946d6 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock); static int umd_setup(struct subprocess_info *info, struct cred *new) { - struct umh_info *umh_info = info->data; + struct umd_info *umd_info = info->data; struct file *from_umh[2]; struct file *to_umh[2]; int err; @@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) return err; } - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - umh_info->pid = task_pid_nr(current); + umd_info->pipe_to_umh = to_umh[1]; + umd_info->pipe_from_umh = from_umh[0]; + umd_info->pid = task_pid_nr(current); current->flags |= PF_UMH; return 0; } static void umd_cleanup(struct subprocess_info *info) { - struct umh_info *umh_info = info->data; + struct umd_info *umd_info = info->data; /* cleanup if umh_setup() was successful but exec failed */ if (info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); + fput(umd_info->pipe_to_umh); + fput(umd_info->pipe_from_umh); } } @@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info) * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such - * case 'struct umh_info *info' is populated with two pipes + * case 'struct umd_info *info' is populated with two pipes * and a pid of the process. The caller is responsible for health * check of the user process, killing it via pid, and closing the * pipes when user process is no longer needed. */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +int fork_usermode_blob(void *data, size_t len, struct umd_info *info) { const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); void __exit_umh(struct task_struct *tsk) { - struct umh_info *info; + struct umd_info *info; pid_t pid = tsk->pid; mutex_lock(&umh_list_lock); -- cgit v1.2.3 From 1199c6c3da5197e9924a906b9de71b8d0ac62a01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 11:38:08 -0500 Subject: umd: Rename umd_info.cmdline umd_info.driver_name The only thing supplied in the cmdline today is the driver name so rename the field to clarify the code. As this value is always supplied stop trying to handle the case of a NULL cmdline. Additionally since we now have a name we can count on use the driver_name any place where the code is looking for a name of the binary. v1: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87366d63os.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-7-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/usermode_driver.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index e73550e946d6..46d60d855e93 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info) * @len: length of the blob * @info: information about usermode process (shouldn't be NULL) * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such * case 'struct umd_info *info' is populated with two pipes @@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info) */ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) { - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; char **argv = NULL; struct file *file; @@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) loff_t pos = 0; int err; - file = shmem_kernel_file_setup("", len, 0); + file = shmem_kernel_file_setup(info->driver_name, len, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) } err = -ENOMEM; - argv = argv_split(GFP_KERNEL, cmdline, NULL); + argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) goto out; - sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL, + GFP_KERNEL, umd_setup, umd_cleanup, info); if (!sub_info) goto out; -- cgit v1.2.3 From e2dc9bf3f5275ca372001541e5f26af572976e65 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 13:12:59 -0500 Subject: umd: Transform fork_usermode_blob into fork_usermode_driver Instead of loading a binary blob into a temporary file with shmem_kernel_file_setup load a binary blob into a temporary tmpfs filesystem. This means that the blob can be stored in an init section and discared, and it means the binary blob will have a filename so can be executed normally. The only tricky thing about this code is that in the helper function blob_to_mnt __fput_sync is used. That is because a file can not be executed if it is still open for write, and the ordinary delayed close for kernel threads does not happen soon enough, which causes the following exec to fail. The function umd_load_blob is not called with any locks so this should be safe. Executing the blob normally winds up correcting several problems with the user mode driver code discovered by Tetsuo Handa[1]. By passing an ordinary filename into the exec, it is no longer necessary to figure out how to turn a O_RDWR file descriptor into a properly referende counted O_EXEC file descriptor that forbids all writes. For path based LSMs there are no new special cases. [1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/ v1: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87wo3p4p35.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-8-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/usermode_driver.c | 126 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 46d60d855e93..a86798759f83 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -4,11 +4,98 @@ */ #include #include +#include +#include +#include #include static LIST_HEAD(umh_list); static DEFINE_MUTEX(umh_list_lock); +static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) +{ + struct file_system_type *type; + struct vfsmount *mnt; + struct file *file; + ssize_t written; + loff_t pos = 0; + + type = get_fs_type("tmpfs"); + if (!type) + return ERR_PTR(-ENODEV); + + mnt = kern_mount(type); + put_filesystem(type); + if (IS_ERR(mnt)) + return mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + if (IS_ERR(file)) { + mntput(mnt); + return ERR_CAST(file); + } + + written = kernel_write(file, data, len, &pos); + if (written != len) { + int err = written; + if (err >= 0) + err = -ENOMEM; + filp_close(file, NULL); + mntput(mnt); + return ERR_PTR(err); + } + + fput(file); + + /* Flush delayed fput so exec can open the file read-only */ + flush_delayed_fput(); + task_work_run(); + return mnt; +} + +/** + * umd_load_blob - Remember a blob of bytes for fork_usermode_driver + * @info: information about usermode driver + * @data: a blob of bytes that can be executed as a file + * @len: The lentgh of the blob + * + */ +int umd_load_blob(struct umd_info *info, const void *data, size_t len) +{ + struct vfsmount *mnt; + + if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) + return -EBUSY; + + mnt = blob_to_mnt(data, len, info->driver_name); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + info->wd.mnt = mnt; + info->wd.dentry = mnt->mnt_root; + return 0; +} +EXPORT_SYMBOL_GPL(umd_load_blob); + +/** + * umd_unload_blob - Disassociate @info from a previously loaded blob + * @info: information about usermode driver + * + */ +int umd_unload_blob(struct umd_info *info) +{ + if (WARN_ON_ONCE(!info->wd.mnt || + !info->wd.dentry || + info->wd.mnt->mnt_root != info->wd.dentry)) + return -EINVAL; + + kern_unmount(info->wd.mnt); + info->wd.mnt = NULL; + info->wd.dentry = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(umd_unload_blob); + static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umd_info *umd_info = info->data; @@ -43,6 +130,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) return err; } + set_fs_pwd(current->fs, &umd_info->wd); umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; umd_info->pid = task_pid_nr(current); @@ -62,39 +150,21 @@ static void umd_cleanup(struct subprocess_info *info) } /** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) + * fork_usermode_driver - fork a usermode driver + * @info: information about usermode driver (shouldn't be NULL) * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. In such - * case 'struct umd_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. 
+ * Returns either negative error or zero which indicates success in + * executing a usermode driver. In such case 'struct umd_info *info' + * is populated with two pipes and a pid of the process. The caller is + * responsible for health check of the user process, killing it via + * pid, and closing the pipes when user process is no longer needed. */ -int fork_usermode_blob(void *data, size_t len, struct umd_info *info) +int fork_usermode_driver(struct umd_info *info) { struct subprocess_info *sub_info; char **argv = NULL; - struct file *file; - ssize_t written; - loff_t pos = 0; int err; - file = shmem_kernel_file_setup(info->driver_name, len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - err = -ENOMEM; argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) @@ -106,7 +176,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) if (!sub_info) goto out; - sub_info->file = file; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); if (!err) { mutex_lock(&umh_list_lock); @@ -116,10 +185,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) out: if (argv) argv_free(argv); - fput(file); return err; } -EXPORT_SYMBOL_GPL(fork_usermode_blob); +EXPORT_SYMBOL_GPL(fork_usermode_driver); void __exit_umh(struct task_struct *tsk) { -- cgit v1.2.3 From 55e6074e3fa67e1fb9ec140904db7e6cae6eda4b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 13:52:50 -0500 Subject: umh: Stop calling do_execve_file With the user mode driver code changed to not set subprocess_info.file there are no more users of subproces_info.file. Remove this field from struct subprocess_info and remove the only user in call_usermodehelper_exec_async that would call do_execve_file instead of do_execve if file was set. v1: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87r1tx4p2a.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-9-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 3e4e453d45c8..6ca2096298b9 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - if (sub_info->file) - retval = do_execve_file(sub_info->file, - sub_info->argv, sub_info->envp); - else - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* -- cgit v1.2.3 From 1c340ead18ee4b4a84357abdef6d4f39ee08328b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 16:48:26 -0500 Subject: umd: Track user space drivers with struct pid Use struct pid instead of user space pid values that are prone to wrap araound. In addition track the entire thread group instead of just the first thread that is started by exec. 
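As a rough illustration of the refcounted-pid pattern described here, the sketch below shows how a caller might pin and later signal such a helper process. The struct and function names (my_umd, my_umd_started, my_umd_stop) are made up for the example; get_pid(), task_tgid(), kill_pid() and put_pid() are the real kernel primitives the change relies on.

  #include <linux/pid.h>
  #include <linux/sched.h>
  #include <linux/sched/signal.h>

  /* Hypothetical per-driver state; only the struct pid handling matters here. */
  struct my_umd {
          struct pid *tgid;       /* refcounted, so the id cannot be recycled */
  };

  /* Runs in the context of the freshly exec'd helper (cf. umd_setup() below). */
  static void my_umd_started(struct my_umd *d)
  {
          d->tgid = get_pid(task_tgid(current));
  }

  /* Runs when the owner no longer needs the helper. */
  static void my_umd_stop(struct my_umd *d)
  {
          if (d->tgid) {
                  kill_pid(d->tgid, SIGKILL, 1);  /* signal the thread group via its leader's pid */
                  put_pid(d->tgid);
                  d->tgid = NULL;
          }
  }

Because the struct pid is reference counted, the helper's id cannot be handed to an unrelated process between my_umd_started() and my_umd_stop(), which is exactly the wraparound hazard that tracking a plain pid_t had.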
There are no multi-threaded user mode drivers today but there is nothing preclucing user drivers from being multi-threaded, so it is just a good idea to track the entire process. Take a reference count on the tgid's in question to make it possible to remove exit_umh in a future change. As a struct pid is available directly use kill_pid_info. The prior process signalling code was iffy in using a userspace pid known to be in the initial pid namespace and then looking up it's task in whatever the current pid namespace is. It worked only because kernel threads always run in the initial pid namespace. As the tgid is now refcounted verify the tgid is NULL at the start of fork_usermode_driver to avoid the possibility of silent pid leaks. v1: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/a70l4oy8.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-12-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 3 ++- kernel/usermode_driver.c | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a081deea52ca..d3294b611df1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -805,7 +805,8 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); + if (group_dead) + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index a86798759f83..f77f8d7ce9e3 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -133,7 +133,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) set_fs_pwd(current->fs, &umd_info->wd); umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; - umd_info->pid = task_pid_nr(current); + umd_info->tgid = get_pid(task_tgid(current)); current->flags |= PF_UMH; return 0; } @@ -146,6 +146,8 @@ static void umd_cleanup(struct subprocess_info *info) if (info->retval) { fput(umd_info->pipe_to_umh); fput(umd_info->pipe_from_umh); + put_pid(umd_info->tgid); + umd_info->tgid = NULL; } } @@ -155,9 +157,9 @@ static void umd_cleanup(struct subprocess_info *info) * * Returns either negative error or zero which indicates success in * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a pid of the process. The caller is + * is populated with two pipes and a tgid of the process. The caller is * responsible for health check of the user process, killing it via - * pid, and closing the pipes when user process is no longer needed. + * tgid, and closing the pipes when user process is no longer needed. 
*/ int fork_usermode_driver(struct umd_info *info) { @@ -165,6 +167,9 @@ int fork_usermode_driver(struct umd_info *info) char **argv = NULL; int err; + if (WARN_ON_ONCE(info->tgid)) + return -EBUSY; + err = -ENOMEM; argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) @@ -192,11 +197,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver); void __exit_umh(struct task_struct *tsk) { struct umd_info *info; - pid_t pid = tsk->pid; + struct pid *tgid = task_tgid(tsk); mutex_lock(&umh_list_lock); list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { + if (info->tgid == tgid) { list_del(&info->list); mutex_unlock(&umh_list_lock); goto out; -- cgit v1.2.3 From ff2a91127b374c75ae024b31d22f23ad49d16eb4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 24 May 2020 20:57:00 +0200 Subject: fork: remove do_fork() Now that all architectures have been switched to use _do_fork() and the new struct kernel_clone_args calling convention we can remove the legacy do_fork() helper completely. The calling convention used to be brittle and do_fork() didn't buy us anything. The only calling convention accepted should be based on struct kernel_clone_args going forward. It's cleaner and uniform. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Al Viro Cc: "Matthew Wilcox (Oracle)" Cc: "Peter Zijlstra (Intel)" Signed-off-by: Christian Brauner --- kernel/fork.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9875aeb2ba41..0fd7eb1b38f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2493,29 +2493,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -/* For compatibility with architectures that call do_fork directly rather than - * using the syscall entry points below. */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct kernel_clone_args args = { - .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), - .pidfd = parent_tidptr, - .child_tid = child_tidptr, - .parent_tid = parent_tidptr, - .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), - .stack = stack_start, - .stack_size = stack_size, - }; - - return _do_fork(&args); -} -#endif - /* * Create a kernel thread. */ @@ -2923,7 +2900,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly + * functions used by _do_fork() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. -- cgit v1.2.3 From 140c8180eb7c7cbda399f64474788b86db72db32 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 24 May 2020 23:34:20 +0200 Subject: arch: remove HAVE_COPY_THREAD_TLS All architectures support copy_thread_tls() now, so remove the legacy copy_thread() function and the HAVE_COPY_THREAD_TLS config option. Everyone uses the same process creation calling convention based on copy_thread_tls() and struct kernel_clone_args. This will make it easier to maintain the core process creation code under kernel/, simplifies the callpaths and makes the identical for all architectures. 
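For readers following the calling-convention cleanup, the shape of a clone request once everything goes through struct kernel_clone_args is sketched below. It is adapted from the do_fork() wrapper removed above (the tid pointers are omitted) and is illustrative only, not a replacement for the real helpers.

  #include <linux/kernel.h>
  #include <linux/sched.h>
  #include <linux/sched/task.h>

  /* Sketch: a legacy-style fork request expressed via kernel_clone_args. */
  static long sketch_fork(unsigned long clone_flags,
                          unsigned long stack_start, unsigned long stack_size)
  {
          struct kernel_clone_args args = {
                  .flags       = (lower_32_bits(clone_flags) & ~CSIGNAL),
                  .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
                  .stack       = stack_start,
                  .stack_size  = stack_size,
          };

          return _do_fork(&args);  /* the single entry point on every architecture */
  }

Keeping one structure-based entry point is what lets the follow-up patches drop HAVE_COPY_THREAD_TLS and rename copy_thread_tls() back to copy_thread() without touching the callers.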
Cc: linux-arch@vger.kernel.org Acked-by: Thomas Bogendoerfer Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Reviewed-by: Kees Cook Signed-off-by: Christian Brauner --- kernel/fork.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0fd7eb1b38f9..8e52e16a1b5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2577,15 +2577,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 -/* - * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from - * the registers containing the syscall arguments for clone. This doesn't work - * with clone3 since the TLS value is passed in clone_args instead. - */ -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -#error clone3 requires copy_thread_tls support in arch -#endif - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) -- cgit v1.2.3 From 714acdbd1c94e7e3ab90f6b6938f1ccb27b662f0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 11 Jun 2020 11:04:15 +0200 Subject: arch: rename copy_thread_tls() back to copy_thread() Now that HAVE_COPY_THREAD_TLS has been removed, rename copy_thread_tls() back simply copy_thread(). It's a simpler name, and doesn't imply that only tls is copied here. This finishes an outstanding chunk of internal process creation work since we've added clone3(). Cc: linux-arch@vger.kernel.org Acked-by: Thomas Bogendoerfer A Acked-by: Stafford Horne Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven A Reviewed-by: Kees Cook Signed-off-by: Christian Brauner --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8e52e16a1b5e..790841eb0a21 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2104,8 +2104,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, - args->tls); + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 38fd525a4c61e7ecdc9ad4dcbf7b767d0a007962 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Jul 2020 07:30:06 -0500 Subject: exit: Factor thread_group_exited out of pidfd_poll Create an independent helper thread_group_exited which returns true when all threads have passed exit_notify in do_exit. AKA all of the threads are at least zombies and might be dead or completely gone. Create this helper by taking the logic out of pidfd_poll where it is already tested, and adding a READ_ONCE on the read of task->exit_state. I will be changing the user mode driver code to use this same logic to know when a user mode driver needs to be restarted. Place the new helper thread_group_exited in kernel/exit.c and EXPORT it so it can be used by modules. Link: https://lkml.kernel.org/r/20200702164140.4468-13-ebiederm@xmission.com Acked-by: Christian Brauner Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/exit.c | 24 ++++++++++++++++++++++++ kernel/fork.c | 6 +----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d3294b611df1..dee246c0866f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1713,6 +1713,30 @@ Efault: } #endif +/** + * thread_group_exited - check that a thread group has exited + * @pid: tgid of thread group to be checked. + * + * Test if the thread group represented by tgid has exited (all + * threads are zombies, dead or completely gone). + * + * Return: true if the thread group has exited. false otherwise. + */ +bool thread_group_exited(struct pid *pid) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && thread_group_empty(task)); + rcu_read_unlock(); + + return exited; +} +EXPORT_SYMBOL(thread_group_exited); + __weak void abort(void) { BUG(); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..bf215af7a904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct task_struct *task; struct pid *pid = file->private_data; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ - if (!task || (task->exit_state && thread_group_empty(task))) + if (thread_group_exited(pid)) poll_flags = EPOLLIN | EPOLLRDNORM; - rcu_read_unlock(); return poll_flags; } -- cgit v1.2.3 From 8c2f52663973e643c617663d826e2b0daa008b38 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 17:40:40 -0500 Subject: umd: Remove exit_umh The bpfilter code no longer uses the umd_info.cleanup callback. This callback is what exit_umh exists to call. So remove exit_umh and all of it's associated booking. v1: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87y2o53abg.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-15-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/exit.c | 3 --- kernel/usermode_driver.c | 28 ---------------------------- 2 files changed, 31 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index dee246c0866f..39226a018ed7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include @@ -805,8 +804,6 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - if (group_dead) - exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index f77f8d7ce9e3..cd136f86f799 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -9,9 +9,6 @@ #include #include -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); - static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) { struct file_system_type *type; @@ -134,7 +131,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; umd_info->tgid = get_pid(task_tgid(current)); - current->flags |= PF_UMH; return 0; } @@ -182,11 +178,6 @@ int fork_usermode_driver(struct umd_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } out: if (argv) argv_free(argv); @@ -194,23 +185,4 @@ out: } EXPORT_SYMBOL_GPL(fork_usermode_driver); -void __exit_umh(struct task_struct *tsk) -{ - struct umd_info *info; - struct pid *tgid = task_tgid(tsk); - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->tgid == tgid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} -- cgit v1.2.3 From 33c326014fe69304244868cc793c2c77be533125 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 29 Jun 2020 08:28:33 -0500 Subject: umd: Stop using split_argv There is exactly one argument so there is nothing to split. All split_argv does now is cause confusion and avoid the need for a cast when passing a "const char *" string to call_usermodehelper_setup. So avoid confusion and the possibility of an odd driver name causing problems by just using a fixed argv array with a cast in the call to call_usermodehelper_setup. v1: https://lkml.kernel.org/r/87sged3a9n.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-16-ebiederm@xmission.com Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. 
Biederman" --- kernel/usermode_driver.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index cd136f86f799..0b35212ffc3d 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -160,27 +160,21 @@ static void umd_cleanup(struct subprocess_info *info) int fork_usermode_driver(struct umd_info *info) { struct subprocess_info *sub_info; - char **argv = NULL; + const char *argv[] = { info->driver_name, NULL }; int err; if (WARN_ON_ONCE(info->tgid)) return -EBUSY; err = -ENOMEM; - argv = argv_split(GFP_KERNEL, info->driver_name, NULL); - if (!argv) - goto out; - - sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL, - GFP_KERNEL, + sub_info = call_usermodehelper_setup(info->driver_name, + (char **)argv, NULL, GFP_KERNEL, umd_setup, umd_cleanup, info); if (!sub_info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); out: - if (argv) - argv_free(argv); return err; } EXPORT_SYMBOL_GPL(fork_usermode_driver); -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } -- cgit v1.2.3 From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..156f51ffada2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: -- cgit v1.2.3 From 42815808f1791eb53c3a05fb78c0a8642ecf8467 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:09 +0200 Subject: timens: make vdso_join_timens() always succeed As discussed on-list (cf. [1]), in order to make setns() support time namespaces when attaching to multiple namespaces at once properly we need to tweak vdso_join_timens() to always succeed. So switch vdso_join_timens() to using a read lock and replacing mmap_write_lock_killable() to mmap_read_lock() as we discussed. Last cycle setns() was changed to support attaching to multiple namespaces atomically. This requires all namespaces to have a point of no return where they can't fail anymore. Specifically, _install() is allowed to perform permission checks and install the namespace into the new struct nsset that it has been given but it is not allowed to make visible changes to the affected task. Once _install() returns anything that the given namespace type requires to be setup in addition needs to ideally be done in a function that can't fail or if it fails the failure is not fatal. For time namespaces the relevant functions that fall into this category are timens_set_vvar_page() and vdso_join_timens(). Currently the latter can fail but doesn't need to. With this we can go on to implement a timens_commit() helper in a follow up patch to be used by setns(). 
[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Will Deacon Cc: Vincenzo Frascino Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Mark Rutland Cc: Dmitry Safonov Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200706154912.3248030-2-christian.brauner@ubuntu.com --- kernel/time/namespace.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5d9fc22d836a..e5af6fe87af8 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -284,7 +284,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); - int err; if (!current_is_single_threaded()) return -EUSERS; @@ -295,9 +294,7 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) timens_set_vvar_page(current, ns); - err = vdso_join_timens(current, ns); - if (err) - return err; + vdso_join_timens(current, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); @@ -313,7 +310,6 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); - int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -321,9 +317,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); - err = vdso_join_timens(tsk, ns); - if (err) - return err; + vdso_join_timens(tsk, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); -- cgit v1.2.3 From 5cfea9a106bbb1af919ab94456fdd02209984070 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:10 +0200 Subject: timens: add timens_commit() helper Wrap the calls to timens_set_vvar_page() and vdso_join_timens() in timens_on_fork() and timens_install() in a new timens_commit() helper. We'll use this helper in a follow-up patch in nsproxy too. 
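Once the nsproxy patch further below wires CLONE_NEWTIME into setns(), the helper introduced here is what makes the switch take effect for the caller. From userspace the new capability can be exercised roughly as sketched below; the program is purely illustrative, and the constraints noted in the comment come from the timens_install() checks visible in the diffs that follow.

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <sched.h>
  #include <stdio.h>
  #include <unistd.h>

  #ifndef CLONE_NEWTIME
  #define CLONE_NEWTIME 0x00000080        /* from <linux/sched.h> */
  #endif

  int main(int argc, char **argv)
  {
          int fd;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s /proc/<pid>/ns/time\n", argv[0]);
                  return 1;
          }

          fd = open(argv[1], O_RDONLY);
          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /* The kernel only allows this for a single-threaded caller with
           * the required CAP_SYS_ADMIN capability. */
          if (setns(fd, CLONE_NEWTIME) < 0) {
                  perror("setns(CLONE_NEWTIME)");
                  return 1;
          }

          execlp("sh", "sh", (char *)NULL);       /* run something in the new time ns */
          perror("execlp");
          return 1;
  }

Because timens_commit() cannot fail, the commit step of setns() stays infallible even when several namespace types are attached in one call, which is the property the vdso_join_timens() change above establishes.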
Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Will Deacon Cc: Vincenzo Frascino Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Mark Rutland Cc: Dmitry Safonov Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200706154912.3248030-3-christian.brauner@ubuntu.com --- kernel/time/namespace.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e5af6fe87af8..aa7b90aac2a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,6 +280,12 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } +static void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -292,9 +298,8 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - timens_set_vvar_page(current, ns); - vdso_join_timens(current, ns); + timens_commit(current, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); @@ -315,14 +320,12 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; - timens_set_vvar_page(tsk, ns); - - vdso_join_timens(tsk, ns); - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; + timens_commit(tsk, ns); + return 0; } -- cgit v1.2.3 From 76c12881a38aaa83e1eb4ce2fada36c3a732bad4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:11 +0200 Subject: nsproxy: support CLONE_NEWTIME with setns() So far setns() was missing time namespace support. This was partially due to it simply not being implemented but also because vdso_join_timens() could still fail which made switching to multiple namespaces atomically problematic. 
This is now fixed so support CLONE_NEWTIME with setns() Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Thomas Gleixner Cc: Michael Kerrisk Cc: Serge Hallyn Cc: Dmitry Safonov Link: https://lore.kernel.org/r/20200706154912.3248030-4-christian.brauner@ubuntu.com --- kernel/nsproxy.c | 21 +++++++++++++++++++-- kernel/time/namespace.c | 5 +---- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index cd356630a311..12dd41b39a7f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p) static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | - CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | + CLONE_NEWPID | CLONE_NEWCGROUP))) return -EINVAL; #ifndef CONFIG_USER_NS @@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags) if (flags & CLONE_NEWNET) return -EINVAL; #endif +#ifndef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + return -EINVAL; +#endif return 0; } @@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid) } #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) { + ret = validate_ns(nsset, &nsp->time_ns->ns); + if (ret) + goto out; + } +#endif + out: if (pid_ns) put_pid_ns(pid_ns); @@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset) exit_sem(me); #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + timens_commit(me, nsset->nsproxy->time_ns); +#endif + /* transfer ownership */ switch_task_namespaces(me, nsset->nsproxy); nsset->nsproxy = NULL; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index aa7b90aac2a7..afc65e6be33e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,7 +280,7 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { timens_set_vvar_page(tsk, ns); vdso_join_timens(tsk, ns); @@ -298,9 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - - timens_commit(current, ns); - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- cgit v1.2.3 From dbfb089d360b1cc623c51a2c7cf9b99eff78e0e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2020 12:40:33 +0200 Subject: sched: Fix loadavg accounting race The recent commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") moved these lines in ttwu(): p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; up before: smp_cond_load_acquire(&p->on_cpu, !VAL); into the 'p->on_rq == 0' block, with the thinking that once we hit schedule() the current task cannot change it's ->state anymore. And while this is true, it is both incorrect and flawed. It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0' to avoid weak hardware from re-ordering things for us. This can fairly easily be achieved by relying on the control-dependency already in place. The second problem, which makes the flaw in the original argument, is that while schedule() will not change prev->state, it will read it a number of times (arguably too many times since it's marked volatile). 
The previous condition 'p->on_cpu == 0' was sufficient because that indicates schedule() has completed, and will no longer read prev->state. So now the trick is to make this same true for the (much) earlier 'prev->on_rq == 0' case. Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0' assignment needs to he a RELEASE, but adding additional ordering to schedule() is an unwelcome proposition at the best of times, doubly so for mere accounting. Luckily we can push the prev->state load up before rq->lock, with the only caveat that we then have to re-read the state after. However, we know that if it changed, we no longer have to worry about the blocking path. This gives us the required ordering, if we block, we did the prev->state load before an (effective) smp_mb() and the p->on_rq store needs not change. With this we end up with the effective ordering: LOAD p->state LOAD-ACQUIRE p->on_rq == 0 MB STORE p->on_rq, 0 STORE p->state, TASK_WAKING which ensures the TASK_WAKING store happens after the prev->state load, and all is well again. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Dave Jones Reported-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dave Jones Tested-by: Paul Gortmaker Link: https://lkml.kernel.org/r/20200707102957.GN117543@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 67 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca5db40392d4..950ac45d5480 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. 
See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -4097,6 +4100,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4113,12 +4117,22 @@ static void __sched notrace __schedule(bool preempt) local_irq_disable(); rcu_note_context_switch(preempt); + /* See deactivate_task() below. */ + prev_state = prev->state; + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4129,10 +4143,31 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + /* + * We must re-load prev->state in case ttwu_remote() changed it + * before we acquired rq->lock. + */ + if (!preempt && prev_state && prev_state == prev->state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...) + * LOCK rq->lock goto out; + * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep(); + * p->on_rq = 0; p->state = TASK_WAKING; + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { -- cgit v1.2.3 From ce3614daabea8a2d01c1dd17ae41d1ec5e5ae7db Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 6 Jul 2020 16:49:10 -0400 Subject: sched: Fix unreliable rseq cpu_id for new tasks While integrating rseq into glibc and replacing glibc's sched_getcpu implementation with rseq, glibc's tests discovered an issue with incorrect __rseq_abi.cpu_id field value right after the first time a newly created process issues sched_setaffinity. 
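To make the failure mode concrete before the reproduction details quoted next, a stripped-down version of the kind of affinity check that caught this might look like the toy program below. It is only a sketch: the real trigger was glibc's posix/tst-affinity-static running against the rseq-backed sched_getcpu(), so an unpatched libc will not necessarily reproduce the problem with this code.

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t set;
          int cpu;

          /* Pin ourselves to CPU 0, then ask which CPU we are on.  With the
           * bug, a freshly created process could still see a stale CPU number
           * outside the new mask in the rseq cpu_id field. */
          CPU_ZERO(&set);
          CPU_SET(0, &set);
          if (sched_setaffinity(0, sizeof(set), &set)) {
                  perror("sched_setaffinity");
                  return 2;
          }

          cpu = sched_getcpu();
          if (cpu != 0) {
                  fprintf(stderr, "error: Unexpected CPU %d, expected 0\n", cpu);
                  return 1;
          }
          return 0;
  }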
For the records, it triggers after building glibc and running tests, and then issuing: for x in {1..2000} ; do posix/tst-affinity-static & done and shows up as: error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 This is caused by the scheduler invoking __set_task_cpu() directly from sched_fork() and wake_up_new_task(), thus bypassing rseq_migrate() which is done by set_task_cpu(). Add the missing rseq_migrate() to both functions. The only other direct use of __set_task_cpu() is done by init_idle(), which does not involve a user-space task. Based on my testing with the glibc test-case, just adding rseq_migrate() to wake_up_new_task() is sufficient to fix the observed issue. Also add it to sched_fork() to keep things consistent. The reason why this never triggered so far with the rseq/basic_test selftest is unclear. The current use of sched_getcpu(3) does not typically require it to be always accurate. However, use of the __rseq_abi.cpu_id field within rseq critical sections requires it to be accurate. If it is not accurate, it can cause corruption in the per-cpu data targeted by rseq critical sections in user-space. Reported-By: Florian Weimer Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Tested-By: Florian Weimer Cc: stable@vger.kernel.org # v4.18+ Link: https://lkml.kernel.org/r/20200707201505.2632-1-mathieu.desnoyers@efficios.com --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 950ac45d5480..e15543cb8481 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2965,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -3029,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); -- cgit v1.2.3 From ff9ff926889dd8026b4ba55266a010c27f68604f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:21 -0700 Subject: perf/core: Factor out functions to allocate/free the task_ctx_data The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. 
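The follow-up patches below switch these helpers from kzalloc() to a kmem_cache supplied by the PMU, so that an alignment requirement (the changelog cites the 64-byte one from Architecture LBR XSAVE) can be honoured. A PMU driver would then set the cache up once at init time, roughly as sketched below; the structure, its size and the pmu variable are hypothetical, while the task_ctx_cache pointer and kmem_cache_create() are what the series actually uses.

  #include <linux/init.h>
  #include <linux/perf_event.h>
  #include <linux/slab.h>

  /* Hypothetical per-task PMU context that must be 64-byte aligned. */
  struct my_pmu_task_ctx {
          u64 regs[16];
  };

  static struct pmu my_pmu;       /* registration details omitted */

  static int __init my_pmu_setup_ctx_cache(void)
  {
          my_pmu.task_ctx_cache = kmem_cache_create("my_pmu_task_ctx",
                                                    sizeof(struct my_pmu_task_ctx),
                                                    64, /* alignment guarantee */
                                                    0, NULL);
          if (!my_pmu.task_ctx_cache)
                  return -ENOMEM;
          return 0;
  }

With that in place, alloc_task_ctx_data() below simply becomes kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL), and the alignment is guaranteed by the cache rather than by the allocator's natural alignment.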
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com --- kernel/events/core.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9b8f92500833..75090403f942 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + kfree(task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4529,11 +4539,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:22 -0700 Subject: perf/core: Use kmem_cache to allocate the PMU specific data Currently, the PMU specific data task_ctx_data is allocated by the function kzalloc() in the perf generic code. When there is no specific alignment requirement for the task_ctx_data, the method works well for now. However, there will be a problem once a specific alignment requirement is introduced in future features, e.g., the Architecture LBR XSAVE feature requires 64-byte alignment. If the specific alignment requirement is not fulfilled, the XSAVE family of instructions will fail to save/restore the xstate to/from the task_ctx_data. The function kzalloc() itself only guarantees a natural alignment. A new method to allocate the task_ctx_data has to be introduced, which has to meet the requirements as below: - must be a generic method can be used by different architectures, because the allocation of the task_ctx_data is implemented in the perf generic code; - must be an alignment-guarantee method (The alignment requirement is not changed after the boot); - must be able to allocate/free a buffer (smaller than a page size) dynamically; - should not cause extra CPU overhead or space overhead. Several options were considered as below: - One option is to allocate a larger buffer for task_ctx_data. E.g., ptr = kmalloc(size + alignment, GFP_KERNEL); ptr &= ~(alignment - 1); This option causes space overhead. - Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. 
As a result, both the generic structure and the PMU specific structure will become bigger. Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option which meets all the aforementioned requirements is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com --- kernel/events/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 75090403f942..30d9b3182369 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- cgit v1.2.3 From 5a09928d339f3cf0973991ddc3a2798825c84c99 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:24 -0700 Subject: perf/x86: Remove task_ctx_size A new kmem_cache method has replaced the kzalloc() to allocate the PMU specific data. The task_ctx_size is not required anymore. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 30d9b3182369..7c436d705fbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,15 +1243,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu) if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - return kzalloc(pmu->task_ctx_size, GFP_KERNEL); + return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); - else - kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- cgit v1.2.3 From 85c2ce9104eb93517db2037699471c517e81f9b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Jun 2020 16:49:05 +0200 Subject: sched, vmlinux.lds: Increase STRUCT_ALIGNMENT to 64 bytes for GCC-4.9 For some mysterious reason GCC-4.9 has a 64 byte section alignment for structures, all other GCC versions (and Clang) tested (including 4.8 and 5.0) are fine with the 32 bytes alignment. 
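The kernel/-only diff below only switches the sched_class annotation from a hard-coded __aligned(32) to __aligned(STRUCT_ALIGNMENT); the alignment constant itself lives in include/asm-generic/vmlinux.lds.h, outside this diff. A minimal sketch of how such a conditional bump could be expressed there (the compiler-version guard shown is an illustrative assumption, not quoted from the patch):

/* GCC-4.9 pads structure sections to 64 bytes; the other tested compilers
 * are happy with 32. Use the worst case so the linker-script layout and the
 * C-side __aligned() annotation stay in agreement.
 */
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 4 && __GNUC_MINOR__ == 9
#define STRUCT_ALIGNMENT 64
#else
#define STRUCT_ALIGNMENT 32
#endif
#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT)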
Getting this right is important for the new SCHED_DATA macro that creates an explicitly ordered array of 'struct sched_class' in the linker script and expect pointer arithmetic to work. Fixes: c3a340f7e7ea ("sched: Have sched_class_highest define by vmlinux.lds.h") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200630144905.GX4817@hirez.programming.kicks-ass.net --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5aa6661ecaf1..9bef2dd01247 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,6 +67,7 @@ #include #include +#include #ifdef CONFIG_PARAVIRT # include @@ -1810,7 +1811,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */ +} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { -- cgit v1.2.3 From d81ae8aac85ca2e307d273f6dc7863a721bf054e Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 30 Jun 2020 12:21:22 +0100 Subject: sched/uclamp: Fix initialization of struct uclamp_rq struct uclamp_rq was zeroed out entirely in assumption that in the first call to uclamp_rq_inc() they'd be initialized correctly in accordance to default settings. But when next patch introduces a static key to skip uclamp_rq_{inc,dec}() until userspace opts in to use uclamp, schedutil will fail to perform any frequency changes because the rq->uclamp[UCLAMP_MAX].value is zeroed at init and stays as such. Which means all rqs are capped to 0 by default. Fix it by making sure we do proper initialization at init without relying on uclamp_rq_inc() doing it later. Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-2-qais.yousef@arm.com --- kernel/sched/core.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15c980af63db..9605db70e671 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1239,6 +1239,20 @@ static void uclamp_fork(struct task_struct *p) } } +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; @@ -1247,11 +1261,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], -- cgit v1.2.3 From 46609ce227039fd192e0ecc7d940bed587fd2c78 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 30 Jun 2020 12:21:23 +0100 Subject: sched/uclamp: Protect uclamp fast path code with static key There is a report that when uclamp is enabled, a netperf UDP test regresses compared to a kernel compiled without uclamp. 
https://lore.kernel.org/lkml/20200529100806.GA3070@suse.de/ While investigating the root cause, there were no sign that the uclamp code is doing anything particularly expensive but could suffer from bad cache behavior under certain circumstances that are yet to be understood. https://lore.kernel.org/lkml/20200616110824.dgkkbyapn3io6wik@e107158-lin/ To reduce the pressure on the fast path anyway, add a static key that is by default will skip executing uclamp logic in the enqueue/dequeue_task() fast path until it's needed. As soon as the user start using util clamp by: 1. Changing uclamp value of a task with sched_setattr() 2. Modifying the default sysctl_sched_util_clamp_{min, max} 3. Modifying the default cpu.uclamp.{min, max} value in cgroup We flip the static key now that the user has opted to use util clamp. Effectively re-introducing uclamp logic in the enqueue/dequeue_task() fast path. It stays on from that point forward until the next reboot. This should help minimize the effect of util clamp on workloads that don't need it but still allow distros to ship their kernels with uclamp compiled in by default. SCHED_WARN_ON() in uclamp_rq_dec_id() was removed since now we can end up with unbalanced call to uclamp_rq_dec_id() if we flip the key while a task is running in the rq. Since we know it is harmless we just quietly return if we attempt a uclamp_rq_dec_id() when rq->uclamp[].bucket[].tasks is 0. In schedutil, we introduce a new uclamp_is_enabled() helper which takes the static key into account to ensure RT boosting behavior is retained. The following results demonstrates how this helps on 2 Sockets Xeon E5 2x10-Cores system. nouclamp uclamp uclamp-static-key Hmean send-64 162.43 ( 0.00%) 157.84 * -2.82%* 163.39 * 0.59%* Hmean send-128 324.71 ( 0.00%) 314.78 * -3.06%* 326.18 * 0.45%* Hmean send-256 641.55 ( 0.00%) 628.67 * -2.01%* 648.12 * 1.02%* Hmean send-1024 2525.28 ( 0.00%) 2448.26 * -3.05%* 2543.73 * 0.73%* Hmean send-2048 4836.14 ( 0.00%) 4712.08 * -2.57%* 4867.69 * 0.65%* Hmean send-3312 7540.83 ( 0.00%) 7425.45 * -1.53%* 7621.06 * 1.06%* Hmean send-4096 9124.53 ( 0.00%) 8948.82 * -1.93%* 9276.25 * 1.66%* Hmean send-8192 15589.67 ( 0.00%) 15486.35 * -0.66%* 15819.98 * 1.48%* Hmean send-16384 26386.47 ( 0.00%) 25752.25 * -2.40%* 26773.74 * 1.47%* The perf diff between nouclamp and uclamp-static-key when uclamp is disabled in the fast path: 8.73% -1.55% [kernel.kallsyms] [k] try_to_wake_up 0.07% +0.04% [kernel.kallsyms] [k] deactivate_task 0.13% -0.02% [kernel.kallsyms] [k] activate_task The diff between nouclamp and uclamp-static-key when uclamp is enabled in the fast path: 8.73% -0.72% [kernel.kallsyms] [k] try_to_wake_up 0.13% +0.39% [kernel.kallsyms] [k] activate_task 0.07% +0.38% [kernel.kallsyms] [k] deactivate_task Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Reported-by: Mel Gorman Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-3-qais.yousef@arm.com --- kernel/sched/core.c | 74 +++++++++++++++++++++++++++++++++++++++- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/sched.h | 47 +++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9605db70e671..4cf30e4de653 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -796,6 +796,26 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or 
equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -992,10 +1012,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1023,6 +1071,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1038,6 +1095,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1146,8 +1212,10 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } /* * We update all RUNNABLE tasks only when task groups are in use. 
@@ -1212,6 +1280,8 @@ static void __setscheduler_uclamp(struct task_struct *p, if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; + static_branch_enable(&sched_uclamp_used); + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); @@ -7436,6 +7506,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..dc6835bc6490 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9bef2dd01247..b1432f608061 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -878,6 +878,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -2365,12 +2367,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ static __always_inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2387,6 +2412,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. 
+ */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, @@ -2394,6 +2432,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, { return util; } + +static inline bool uclamp_is_used(void) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From 9d246053a69196c7c27068870e9b4b66ac536f68 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Mon, 29 Jun 2020 15:23:03 -0400 Subject: sched: Add a tracepoint to track rq->nr_running Add a bare tracepoint trace_sched_update_nr_running_tp which tracks ->nr_running CPU's rq. This is used to accurately trace this data and provide a visualization of scheduler imbalances in, for example, the form of a heat map. The tracepoint is accessed by loading an external kernel module. An example module (forked from Qais' module and including the pelt related tracepoints) can be found at: https://github.com/auldp/tracepoints-helpers.git A script to turn the trace-cmd report output into a heatmap plot can be found at: https://github.com/jirvoz/plot-nr-running The tracepoints are added to add_nr_running() and sub_nr_running() which are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in the header a wrapper call is used and the trace/events/sched.h include is moved before sched.h in kernel/sched/core. Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com --- kernel/sched/core.c | 13 +++++++++---- kernel/sched/fair.c | 8 ++++++-- kernel/sched/pelt.c | 2 -- kernel/sched/sched.h | 10 ++++++++++ 4 files changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4cf30e4de653..ff0519551188 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,6 +6,10 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + #include "sched.h" #include @@ -23,9 +27,6 @@ #include "pelt.h" #include "smp.h" -#define CREATE_TRACE_POINTS -#include - /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. @@ -38,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -8195,4 +8197,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6fab1d17c575..3213cb247aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,8 +22,6 @@ */ #include "sched.h" -#include - /* * Targeted preemption latency for CPU-bound tasks: * @@ -11296,3 +11294,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd) #endif } EXPORT_SYMBOL_GPL(sched_trace_rd_span); + +int sched_trace_rq_nr_running(struct rq *rq) +{ + return rq ? 
rq->nr_running : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 11bea3b08115..2c613e1cff3a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,8 +28,6 @@ #include "sched.h" #include "pelt.h" -#include - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1432f608061..65b72e0487bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -76,6 +76,8 @@ #include "cpupri.h" #include "cpudeadline.h" +#include + #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -97,6 +99,7 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -1973,6 +1976,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count) unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1987,6 +1993,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } -- cgit v1.2.3 From 05eee619ed61c8cd89633954d38c4e5653086845 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:22 +0800 Subject: x86/kvm: Add "nopvspin" parameter to disable PV spinlocks There are cases where a guest tries to switch spinlocks to bare metal behavior (e.g. by setting "xen_nopvspin" on XEN platform and "hv_nopvspin" on HYPER_V). That feature is missed on KVM, add a new parameter "nopvspin" to disable PV spinlocks for KVM guest. The new 'nopvspin' parameter will also replace Xen and Hyper-V specific parameters in future patches. Define variable nopvsin as global because it will be used in future patches as above. Signed-off-by: Zhenzhong Duan Reviewed-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Paolo Bonzini --- kernel/locking/qspinlock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif -- cgit v1.2.3 From 160251842cd35a75edfb0a1d76afa3eb674ff40a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 11:49:23 -0700 Subject: kallsyms: Refactor kallsyms_show_value() to take cred In order to perform future tests against the cred saved during open(), switch kallsyms_show_value() to operate on a cred, and have all current callers pass current_cred(). This makes it very obvious where callers are checking the wrong credential in their "read" contexts. These will be fixed in the coming patches. Additionally switch return value to bool, since it is always used as a direct permission check, not a 0-on-success, negative-on-error style function return. Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/kallsyms.c | 17 +++++++++++------ kernel/kprobes.c | 4 ++-- kernel/module.c | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..bb14e64f62a4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -644,19 +644,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +674,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..d4de217e4a91 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. 
*/ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..a5022ae84e50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4377,7 +4377,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(current_cred()) ? NULL : (void *)8ul; } return err; -- cgit v1.2.3 From ed66f991bb19d94cae5d38f77de81f96aac7813f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 13:47:20 -0700 Subject: module: Refactor section attr into bin attribute In order to gain access to the open file's f_cred for kallsym visibility permission checks, refactor the module section attributes to use the bin_attribute instead of attribute interface. Additionally removes the redundant "name" struct member. Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Tested-by: Jessica Yu Acked-by: Jessica Yu Signed-off-by: Kees Cook --- kernel/module.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a5022ae84e50..9e2954519259 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1510,8 +1510,7 @@ static inline bool sect_empty(const Elf_Shdr *sect) } struct module_sect_attr { - struct module_attribute mattr; - char *name; + struct bin_attribute battr; unsigned long address; }; @@ -1521,11 +1520,16 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); + container_of(battr, struct module_sect_attr, battr); + + if (pos != 0) + return -EINVAL; + return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? (void *)sattr->address : NULL); } @@ -1535,7 +1539,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs->attrs[section].battr.attr.name); kfree(sect_attrs); } @@ -1544,42 +1548,41 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - struct attribute **gattr; + struct bin_attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (sect_attrs == NULL) return; /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; + sysfs_bin_attr_init(&sattr->battr); sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) goto out; sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUSR; - *(gattr++) = &(sattr++)->mattr.attr; + sattr->battr.read = module_sect_read; + sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; } *gattr = NULL; @@ -1669,7 +1672,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; nattr->attr.mode = S_IRUGO; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *) info->sechdrs[i].sh_addr; -- cgit v1.2.3 From b25a7c5af9051850d4f3d93ca500056ab6ec724b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 14:43:59 -0700 Subject: module: Do not expose section addresses to non-CAP_SYSLOG The printing of section addresses in /sys/module/*/sections/* was not using the correct credentials to evaluate visibility. Before: # cat /sys/module/*/sections/.*text 0xffffffffc0458000 ... # capsh --drop=CAP_SYSLOG -- -c "cat /sys/module/*/sections/.*text" 0xffffffffc0458000 ... After: # cat /sys/module/*/sections/*.text 0xffffffffc0458000 ... # capsh --drop=CAP_SYSLOG -- -c "cat /sys/module/*/sections/.*text" 0x0000000000000000 ... Additionally replaces the existing (safe) /proc/modules check with file->f_cred for consistency. Reported-by: Dominik Czarnota Fixes: be71eda5383f ("module: Fix display of wrong module .text address") Cc: stable@vger.kernel.org Tested-by: Jessica Yu Acked-by: Jessica Yu Signed-off-by: Kees Cook --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9e2954519259..e6c7571092cb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1530,8 +1530,8 @@ static ssize_t module_sect_read(struct file *file, struct kobject *kobj, if (pos != 0) return -EINVAL; - return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? - (void *)sattr->address : NULL); + return sprintf(buf, "0x%px\n", + kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -4380,7 +4380,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value(current_cred()) ? NULL : (void *)8ul; + m->private = kallsyms_show_value(file->f_cred) ? 
NULL : (void *)8ul; } return err; -- cgit v1.2.3 From 60f7bb66b88b649433bf700acfc60c3f24953871 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:20:22 -0700 Subject: kprobes: Do not expose probe addresses to non-CAP_SYSLOG The kprobe show() functions were using "current"'s creds instead of the file opener's creds for kallsyms visibility. Fix to use seq_file->file->f_cred. Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 81365a947de4 ("kprobes: Show address of kprobes if kallsyms does") Fixes: ffb9bd68ebdb ("kprobes: Show blacklist addresses as same as kallsyms does") Signed-off-by: Kees Cook --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d4de217e4a91..2e97febeef77 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value(current_cred())) + if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value(current_cred())) + if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else -- cgit v1.2.3 From 63960260457a02af2a6cb35d75e6bdb17299c882 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:45:23 -0700 Subject: bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() When evaluating access control over kallsyms visibility, credentials at open() time need to be used, not the "current" creds (though in BPF's case, this has likely always been the same). Plumb access to associated file->f_cred down through bpf_dump_raw_ok() and its callers now that kallsysm_show_value() has been refactored to take struct cred. 
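The kernel/bpf/syscall.c diff below only plumbs the opener's file->f_cred into the existing callers; the bpf_dump_raw_ok() helper itself sits in a header (include/linux/filter.h) outside this kernel/-only diff. Given the kallsyms_show_value() refactor earlier in this series, the cred-taking form is presumably little more than a forwarder; a sketch under that assumption:

/* Sketch only: reuse the kallsyms visibility policy, evaluated against the
 * credentials captured at open() time rather than against "current".
 */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
	return kallsyms_show_value(cred);
}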
Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Signed-off-by: Kees Cook --- kernel/bpf/syscall.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..859053ddf05b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3139,7 +3139,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3165,7 +3166,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } @@ -3221,7 +3222,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3290,11 +3292,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3328,7 +3330,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3363,7 +3365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3394,7 +3396,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? 
: 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3451,7 +3453,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3497,7 +3499,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3540,7 +3543,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3555,7 +3559,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3608,15 +3613,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; -- cgit v1.2.3 From d7481b24b816b8c3955a9eaf01b97e2bd7f61a37 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 3 Jul 2020 12:56:19 -0400 Subject: audit: issue CWD record to accompany LSM_AUDIT_DATA_* records The LSM_AUDIT_DATA_* records for PATH, FILE, IOCTL_OP, DENTRY and INODE are incomplete without the task context of the AUDIT Current Working Directory record. Add it. This record addition can't use audit_dummy_context to determine whether or not to store the record information since the LSM_AUDIT_DATA_* records are initiated by various LSMs independent of any audit rules. context->in_syscall is used to determine if it was called in user context like audit_getname. Please see the upstream issue https://github.com/linux-audit/audit-kernel/issues/96 Adapted from Vladis Dronov's v2 patch. 
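The kernel/auditsc.c diff below adds the __audit_getcwd() entry point; the callers that actually accompany the LSM_AUDIT_DATA_* records live in the LSM audit code, outside this kernel/-only diff. A hedged sketch of the header-side wrapper such callers would presumably use (naming by analogy with the other __audit_* helpers; not quoted from the patch):

/* Sketch only: cheap no-op when auditing is not active for this task,
 * otherwise record the working directory for the upcoming CWD record.
 */
static inline void audit_getcwd(void)
{
	if (unlikely(audit_context()))
		__audit_getcwd();
}

The __audit_getcwd() shown in the diff then applies the context->in_syscall test described above, so records triggered outside of syscall context are still skipped.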
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index eae1a599ffe3..6884b50069d1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1891,6 +1891,20 @@ __audit_reusename(const __user char *uptr) return NULL; } +inline void _audit_getcwd(struct audit_context *context) +{ + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); +} + +void __audit_getcwd(void) +{ + struct audit_context *context = audit_context(); + + if (context->in_syscall) + _audit_getcwd(context); +} + /** * __audit_getname - add a name to the list * @name: name to add @@ -1915,8 +1929,7 @@ void __audit_getname(struct filename *name) name->aname = n; name->refcnt++; - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); + _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, -- cgit v1.2.3 From 30c66fc30ee7a98c4f3adf5fb7e213b61884474f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Jul 2020 03:06:57 +0200 Subject: timer: Prevent base->clk from moving backward When a timer is enqueued with a negative delta (ie: expiry is below base->clk), it gets added to the wheel as expiring now (base->clk). Yet the value that gets stored in base->next_expiry, while calling trigger_dyntick_cpu(), is the initial timer->expires value. The resulting state becomes: base->next_expiry < base->clk On the next timer enqueue, forward_timer_base() may accidentally rewind base->clk. As a possible outcome, timers may expire way too early, the worst case being that the highest wheel levels get spuriously processed again. To prevent from that, make sure that base->next_expiry doesn't get below base->clk. Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Tested-by: Juri Lelli Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200703010657.2302-1-frederic@kernel.org --- kernel/time/timer.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 398e6eadb861..9a838d38dbe6 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -584,7 +584,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - base->next_expiry = timer->expires; + if (time_before(timer->expires, base->clk)) { + /* + * Prevent from forward_timer_base() moving the base->clk + * backward + */ + base->next_expiry = base->clk; + } else { + base->next_expiry = timer->expires; + } wake_up_nohz_cpu(base->cpu); } @@ -896,10 +904,13 @@ static inline void forward_timer_base(struct timer_base *base) * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. 
*/ - if (time_after(base->next_expiry, jnow)) + if (time_after(base->next_expiry, jnow)) { base->clk = jnow; - else + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; base->clk = base->next_expiry; + } #endif } -- cgit v1.2.3 From f3dda7a679df183e798b86e7b6ec05ab35476de3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:04 -0700 Subject: bpf: net: Avoid copying sk_user_data of reuseport_array during sk_clone It makes little sense for copying sk_user_data of reuseport_array during sk_clone_lock(). This patch reuses the SK_USER_DATA_NOCOPY bit introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). It is used to mark the sk_user_data is not supposed to be copied to its clone. Although the cloned sk's sk_user_data will not be used/freed in bpf_sk_reuseport_detach(), this change can still allow the cloned sk's sk_user_data to be used by some other means. Freeing the reuseport_array's sk_user_data does not require a rcu grace period. Thus, the existing rcu_assign_sk_user_data_nocopy() is not used. Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200709061104.4018798-1-kafai@fb.com --- kernel/bpf/reuseport_array.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..a95bc8d7e812 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -20,11 +20,14 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - struct sock __rcu **socks; + uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); - socks = sk->sk_user_data; - if (socks) { + sk_user_data = (uintptr_t)sk->sk_user_data; + if (sk_user_data) { + struct sock __rcu **socks; + + socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -252,6 +255,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; + uintptr_t sk_user_data; struct socket *socket; int err, fd; @@ -305,7 +309,8 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY; + WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; -- cgit v1.2.3 From c9a368f1c0fbe2e3a21ebf231caeae58b18b2681 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:10 -0700 Subject: bpf: net: Avoid incorrect bpf_sk_reuseport_detach call bpf_sk_reuseport_detach is currently called when sk->sk_user_data is not NULL. It is incorrect because sk->sk_user_data may not be managed by the bpf's reuseport_array. It has been reported in [1] that, the bpf_sk_reuseport_detach() which is called from udp_lib_unhash() has corrupted the sk_user_data managed by l2tp. This patch solves it by using another bit (defined as SK_USER_DATA_BPF) of the sk_user_data pointer value. It marks that a sk_user_data is managed/owned by BPF. 
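As a concrete illustration of the scheme (the flag values and helper names below are assumptions made for the sketch; the real SK_USER_DATA_* definitions live in include/net/sock.h, outside this kernel/-only diff), the low bits of the sk_user_data pointer carry ownership flags and are masked off before the pointer is dereferenced:

/* Hypothetical flag layout, mirroring the names used in the diff below. */
#define SK_USER_DATA_NOCOPY	1UL	/* do not copy on sk_clone_lock() */
#define SK_USER_DATA_BPF	2UL	/* managed/owned by BPF */
#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)

/* Illustrative helpers (not part of the patch). */
static inline bool sk_user_data_is_bpf(const struct sock *sk)
{
	return (uintptr_t)sk->sk_user_data & SK_USER_DATA_BPF;
}

static inline void *sk_user_data_untagged(const struct sock *sk)
{
	return (void *)((uintptr_t)sk->sk_user_data & SK_USER_DATA_PTRMASK);
}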
The patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). [ Note: sk->sk_user_data is used by bpf's reuseport_array only when a sk is added to the bpf's reuseport_array. i.e. doing setsockopt(SO_REUSEPORT) and having "sk->sk_reuseport == 1" alone will not stop sk->sk_user_data being used by other means. ] [1]: https://lore.kernel.org/netdev/20200706121259.GA20199@katalix.com/ Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Reported-by: James Chapman Reported-by: syzbot+9f092552ba9a5efca5df@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: James Chapman Acked-by: James Chapman Link: https://lore.kernel.org/bpf/20200709061110.4019316-1-kafai@fb.com --- kernel/bpf/reuseport_array.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a95bc8d7e812..cae9d505e04a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data) { + if (sk_user_data & SK_USER_DATA_BPF) { struct sock __rcu **socks; socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); @@ -309,7 +309,8 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY; + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; -- cgit v1.2.3 From 746cf3459f118592a72ef42e7551777ff17b1684 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 3 Jul 2020 10:06:09 +0800 Subject: tracing: Simplify defining of the next event id The value to be used and compared in trace_search_list() is "last + 1". Let's just define next to be "last + 1" instead of doing the addition each time. Link: https://lkml.kernel.org/r/20200703020612.12930-2-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 73976de7f8cc..a35232d61601 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { struct trace_event *e; - int last = __TRACE_LAST_TYPE; + int next = __TRACE_LAST_TYPE + 1; if (list_empty(&ftrace_event_list)) { *list = &ftrace_event_list; - return last + 1; + return next; } /* @@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list) * lets see if somebody freed one. */ list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != last + 1) + if (e->type != next) break; - last++; + next++; } /* Did we used up all 65 thousand events??? 
*/ - if ((last + 1) > TRACE_EVENT_TYPE_MAX) + if (next > TRACE_EVENT_TYPE_MAX) return 0; *list = &e->list; - return last + 1; + return next; } void trace_event_read_lock(void) -- cgit v1.2.3 From 36b8aacf2a483ba40fbe91c830d314e0bc133044 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 3 Jul 2020 10:06:10 +0800 Subject: tracing: Save one trace_event->type by using __TRACE_LAST_TYPE Static defined trace_event->type stops at (__TRACE_LAST_TYPE - 1) and dynamic trace_event->type starts from (__TRACE_LAST_TYPE + 1). To save one trace_event->type index, let's use __TRACE_LAST_TYPE. Link: https://lkml.kernel.org/r/20200703020612.12930-3-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a35232d61601..4d1893564912 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE + 1; +static int next_event_type = __TRACE_LAST_TYPE; enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -675,7 +675,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { struct trace_event *e; - int next = __TRACE_LAST_TYPE + 1; + int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { *list = &ftrace_event_list; -- cgit v1.2.3 From 8c080d3a974ad471d8324825851044284f1886c9 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jun 2020 13:36:42 +0800 Subject: kgdb: enable arch to support XML packet. The XML packet could be supported by required architecture if the architecture defines CONFIG_HAVE_ARCH_KGDB_QXFER_PKT and implement its own kgdb_arch_handle_qxfer_pkt(). Except for the kgdb_arch_handle_qxfer_pkt(), the architecture also needs to record the feature supported by gdb stub into the kgdb_arch_gdb_stub_feature, and these features will be reported to host gdb when gdb stub receives the qSupported packet. Signed-off-by: Vincent Chen Acked-by: Daniel Thompson Signed-off-by: Palmer Dabbelt --- kernel/debug/gdbstub.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 61774aec46b4..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } -- cgit v1.2.3 From 248591f5d257a19c1cba9ab9da3536bfbc2f0149 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 24 Jun 2020 13:32:46 +0200 Subject: kcsan: Make KCSAN compatible with new IRQ state tracking The new IRQ state tracking code does not honor lockdep_off(), and as such we should again permit tracing by using non-raw functions in core.c. Update the lockdep_off() comment in report.c, to reflect the fact there is still a potential risk of deadlock due to using printk() from scheduler code. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200624113246.GA170324@elver.google.com --- kernel/kcsan/core.c | 5 ++--- kernel/kcsan/report.c | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..732623c30359 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) } if (!kcsan_interrupt_watcher) - /* Use raw to avoid lockdep recursion via IRQ flags tracing. */ - raw_local_irq_save(irq_flags); + local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) - raw_local_irq_restore(irq_flags); + local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ac5f8345bae9..6b2fb1a6d8cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, goto out; /* - * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if - * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a race in utilities used by - * lockdep. + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. */ lockdep_off(); -- cgit v1.2.3 From 859d069ee1ddd87862e1d6a356a82ed417dbeb67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 15:00:57 +0200 Subject: lockdep: Prepare for NMI IRQ state tracking There is no reason not to always, accurately, track IRQ state. This change also makes IRQ state tracking ignore lockdep_off(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.155449112@infradead.org --- kernel/locking/lockdep.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..d595623c4b34 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task) static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -3646,7 +3646,16 @@ static void __trace_hardirqs_on_caller(void) */ void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. 
+ */ + if (unlikely(in_nmi())) + return; + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (unlikely(current->hardirqs_enabled)) { @@ -3692,7 +3701,27 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (curr->hardirqs_enabled) { @@ -3720,6 +3749,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; curr->hardirq_enable_ip = ip; @@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) return; /* -- cgit v1.2.3 From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2020 12:22:41 +0200 Subject: lockdep: Change hardirq{s_enabled,_context} to per-cpu variables Currently all IRQ-tracking state is in task_struct, this means that task_struct needs to be defined before we use it. Especially for lockdep_assert_irq*() this can lead to header-hell. Move the hardirq state into per-cpu variables to avoid the task_struct dependency. 
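The kernel/ diff below only shows the consumers being converted (this_cpu_write(hardirqs_enabled, ...), lockdep_hardirqs_enabled(curr), lockdep_hardirq_context(curr)); the per-cpu declarations and accessor macros live in headers outside this kernel/-only diff, and the kernel/softirq.c hunk listed in the diffstat but not visible here presumably holds the definitions. A minimal sketch of that side, under those assumptions:

/* Header-side sketch: IRQ-tracking state as per-CPU variables rather than
 * task_struct fields. The task argument is kept so existing call sites such
 * as lockdep_hardirqs_enabled(curr) still compile, but the state is read
 * from the current CPU.
 */
DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);

#define lockdep_hardirq_context(p)	(this_cpu_read(hardirq_context))
#define lockdep_hardirqs_enabled(p)	(this_cpu_read(hardirqs_enabled))

/* And, in one compilation unit (e.g. kernel/softirq.c): */
DEFINE_PER_CPU(int, hardirqs_enabled);
DEFINE_PER_CPU(int, hardirq_context);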
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org --- kernel/fork.c | 4 +--- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 6 ++++++ 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..70d9d0a4de2a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -2036,7 +2036,6 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; - p->hardirqs_enabled = 0; p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process( p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; - p->hardirq_context = 0; p->softirq_context = 0; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..ab4ffbe0e9e9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(curr), curr->softirqs_enabled); print_lock(next); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled(current))) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. 
*/ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3751,7 +3751,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; + this_cpu_write(hardirqs_enabled, 1); curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3783,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; + this_cpu_write(hardirqs_enabled, 0); curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) + if (lockdep_hardirqs_enabled(curr)) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context(curr)) { /* * Check if force_irqthreads will run us threaded. 
*/ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..342c53feaa7a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; -- cgit v1.2.3 From f9ad4a5f3f20bee022b1bdde94e5ece6dc0b0edc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 13:03:26 +0200 Subject: lockdep: Remove lockdep_hardirq{s_enabled,_context}() argument Now that the macros use per-cpu data, we no longer need the argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.571835311@infradead.org --- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ab4ffbe0e9e9..c9ea05edce25 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(lockdep_hardirqs_enabled(current))) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. 
*/ - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3783,7 +3783,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * We have done an ON -> OFF transition: */ @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (lockdep_hardirqs_enabled(curr)) + if (lockdep_hardirqs_enabled()) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (lockdep_hardirq_context(curr)) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 342c53feaa7a..5e9aaa648a74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } -- cgit v1.2.3 From c818c03b661cd769e035e41673d5543ba2ebda64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:11:26 -0700 Subject: seccomp: Report number of loaded filters in /proc/$pid/status A common question asked when debugging seccomp filters is "how many filters are attached to your process?" Provide a way to easily answer this question through /proc/$pid/status with a "Seccomp_filters" line. 
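For instance, a debugging tool could read the new field like this (illustrative userspace code; only the field name is specified by this patch, the exact value formatting is an assumption):

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[256];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "Seccomp_filters:\t2" for two attached filters */
		if (!strncmp(line, "Seccomp_filters:", 16))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
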
Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d653d8426de9..f387e5004c29 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -398,6 +398,8 @@ static inline void seccomp_sync_threads(unsigned long flags) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + atomic_set(&thread->seccomp.filter_count, + atomic_read(&thread->seccomp.filter_count)); /* * Don't let an unprivileged task work around @@ -544,6 +546,7 @@ static long seccomp_attach_filter(unsigned int flags, */ filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) -- cgit v1.2.3 From 9f87dcf14b82b05ff0e26970439b372ae135de0c Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 1 Jun 2020 04:25:32 -0700 Subject: seccomp: Add find_notification helper This adds a helper which can iterate through a seccomp_filter to find a notification matching an ID. It removes several replicated chunks of code. Signed-off-by: Sargun Dhillon Acked-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Matt Denton Cc: Kees Cook , Cc: Jann Horn , Cc: Robert Sesek , Cc: Chris Palmer Cc: Christian Brauner Cc: Tycho Andersen Link: https://lore.kernel.org/r/20200601112532.150158-1-sargun@sargun.me Signed-off-by: Kees Cook --- kernel/seccomp.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f387e5004c29..e711d697afb6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,6 +41,7 @@ #include #include #include +#include enum notify_state { SECCOMP_NOTIFY_INIT, @@ -1024,6 +1025,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) return 0; } +/* must be called with notif_lock held */ +static inline struct seccomp_knotif * +find_notification(struct seccomp_filter *filter, u64 id) +{ + struct seccomp_knotif *cur; + + lockdep_assert_held(&filter->notify_lock); + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == id) + return cur; + } + + return NULL; +} + + static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { @@ -1081,15 +1099,8 @@ out: * may have died when we released the lock, so we need to make * sure it's still around. 
*/ - knotif = NULL; mutex_lock(&filter->notify_lock); - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == unotif.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, unotif.id); if (knotif) { knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); @@ -1104,7 +1115,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; - struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) @@ -1121,13 +1132,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (ret < 0) return ret; - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == resp.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; @@ -1153,7 +1158,7 @@ out: static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { - struct seccomp_knotif *knotif = NULL; + struct seccomp_knotif *knotif; u64 id; long ret; @@ -1164,16 +1169,12 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, if (ret < 0) return ret; - ret = -ENOENT; - list_for_each_entry(knotif, &filter->notif->notifications, list) { - if (knotif->id == id) { - if (knotif->state == SECCOMP_NOTIFY_SENT) - ret = 0; - goto out; - } - } + knotif = find_notification(filter, id); + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + else + ret = -ENOENT; -out: mutex_unlock(&filter->notify_lock); return ret; } -- cgit v1.2.3 From b707ddee11d1dc4518ab7f1aa5e7af9ceaa23317 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:28 +0200 Subject: seccomp: rename "usage" to "refs" and document Naming the lifetime counter of a seccomp filter "usage" suggests a little too strongly that its about tasks that are using this filter while it also tracks other references such as the user notifier or ptrace. This also updates the documentation to note this fact. We'll be introducing an actual usage counter in a follow-up patch. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-1-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e711d697afb6..d4dd3344e312 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,10 +107,11 @@ struct notification { /** * struct seccomp_filter - container for seccomp BPF programs * - * @usage: reference count to manage the object lifetime. - * get/put helpers should be used when accessing an instance - * outside of a lifetime-guarded section. In general, this - * is only needed for handling filters shared across tasks. + * @refs: Reference count to manage the object lifetime. + * A filter's reference count is incremented for each directly + * attached task, once for the dependent filter, and if + * requested for the user notifier. When @refs reaches zero, + * the filter can be freed. 
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -125,10 +126,10 @@ struct notification { * how namespaces work. * * seccomp_filter objects should never be modified after being attached - * to a task_struct (other than @usage). + * to a task_struct (other than @refs). */ struct seccomp_filter { - refcount_t usage; + refcount_t refs; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -464,7 +465,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - refcount_set(&sfilter->usage, 1); + refcount_set(&sfilter->refs, 1); return sfilter; } @@ -558,7 +559,7 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - refcount_inc(&filter->usage); + refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -581,7 +582,7 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->usage)) { + while (orig && refcount_dec_and_test(&orig->refs)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); -- cgit v1.2.3 From 3a15fb6ed92cb32b0a83f406aa4a96f28c9adbc3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:29 +0200 Subject: seccomp: release filter after task is fully dead The seccomp filter used to be released in free_task() which is called asynchronously via call_rcu() and assorted mechanisms. Since we need to inform tasks waiting on the seccomp notifier when a filter goes empty we will notify them as soon as a task has been marked fully dead in release_task(). To not split seccomp cleanup into two parts, move filter release out of free_task() and into release_task() after we've unhashed struct task from struct pid, exited signals, and unlinked it from the threadgroups' thread list. We'll put the empty filter notification infrastructure into it in a follow up patch. This also renames put_seccomp_filter() to seccomp_filter_release() which is a more descriptive name of what we're doing here especially once we've added the empty filter notification mechanism in there. We're also NULL-ing the task's filter tree entrypoint which seems cleaner than leaving a dangling pointer in there. Note that this shouldn't need any memory barriers since we're calling this when the task is in release_task() which means it's EXIT_DEAD. So it can't modify its seccomp filters anymore. You can also see this from the point where we're calling seccomp_filter_release(). It's after __exit_signal() and at this point, tsk->sighand will already have been NULLed which is required for thread-sync and filter installation alike. 
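Condensed, the resulting release path looks as follows (simplified from the kernel/exit.c hunk below; unrelated teardown steps are summarized in comments):

void release_task(struct task_struct *p)
{
	/* ... unhash from struct pid, __exit_signal(p), unlink from the
	 * thread-group list, all under tasklist_lock ... */
	write_unlock_irq(&tasklist_lock);

	/* p is EXIT_DEAD and p->sighand is NULL, so no barriers needed. */
	seccomp_filter_release(p);

	proc_flush_pid(thread_pid);	/* thread_pid taken earlier, as before */
	/* ... put_pid(), release_thread(), remaining cleanup ... */
}
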
Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/exit.c | 1 + kernel/fork.c | 1 - kernel/seccomp.c | 62 +++++++++++++++++++++++++++++++++----------------------- 3 files changed, 38 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..00d77e5ba700 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -217,6 +217,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..c51a9cd824c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -473,7 +473,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d4dd3344e312..0ca6d5243427 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -368,6 +368,42 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +/** + * seccomp_filter_release - Detach the task from its filter tree + * and drop its reference count during + * exit. + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. As such, READ_ONCE() and + * barriers are not needed here, as would normally be needed. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + + /* Detach task from its filter tree. */ + tsk->seccomp.filter = NULL; + __put_seccomp_filter(orig); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -397,7 +433,7 @@ static inline void seccomp_sync_threads(unsigned long flags) * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __put_seccomp_filter(thread->seccomp.filter); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -571,30 +607,6 @@ void get_seccomp_filter(struct task_struct *tsk) __get_seccomp_filter(orig); } -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. 
*/ - while (orig && refcount_dec_and_test(&orig->refs)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); -} - static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) { clear_siginfo(info); -- cgit v1.2.3 From 76194c4e830d570d9e369d637bb907591d2b3111 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Jun 2020 11:50:07 -0700 Subject: seccomp: Lift wait_queue into struct seccomp_filter Lift the wait_queue from struct notification into struct seccomp_filter. This is cleaner overall and lets us avoid having to take the notifier mutex in the future for EPOLLHUP notifications since we need to neither read nor modify the notifier specific aspects of the seccomp filter. In the exit path I'd very much like to avoid having to take the notifier mutex for each filter in the task's filter hierarchy. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ca6d5243427..39c1c687d8ad 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -95,13 +95,11 @@ struct seccomp_knotif { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @wqh: A wait queue for poll. */ struct notification { struct semaphore request; u64 next_id; struct list_head notifications; - wait_queue_head_t wqh; }; /** @@ -117,6 +115,7 @@ struct notification { * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. + * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -135,6 +134,7 @@ struct seccomp_filter { struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; + wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. 
*/ @@ -502,6 +502,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + init_waitqueue_head(&sfilter->wqh); return sfilter; } @@ -774,7 +775,7 @@ static int seccomp_do_user_notification(int this_syscall, list_add(&n.list, &match->notif->notifications); up(&match->notif->request); - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); mutex_unlock(&match->notify_lock); /* @@ -1098,7 +1099,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); @@ -1217,7 +1218,7 @@ static __poll_t seccomp_notify_poll(struct file *file, __poll_t ret = 0; struct seccomp_knotif *cur; - poll_wait(file, &filter->notif->wqh, poll_tab); + poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; @@ -1261,7 +1262,6 @@ static struct file *init_listener(struct seccomp_filter *filter) sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); - init_waitqueue_head(&filter->notif->wqh); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); -- cgit v1.2.3 From 99cdb8b9a57393b5978e7a6310a2cba511dd179b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:30 +0200 Subject: seccomp: notify about unused filter We've been making heavy use of the seccomp notifier to intercept and handle certain syscalls for containers. This patch allows a syscall supervisor listening on a given notifier to be notified when a seccomp filter has become unused. A container is often managed by a singleton supervisor process the so-called "monitor". This monitor process has an event loop which has various event handlers registered. If the user specified a seccomp profile that included a notifier for various syscalls then we also register a seccomp notify even handler. For any container using a separate pid namespace the lifecycle of the seccomp notifier is bound to the init process of the pid namespace, i.e. when the init process exits the filter must be unused. If a new process attaches to a container we force it to assume a seccomp profile. This can either be the same seccomp profile as the container was started with or a modified one. If the attaching process makes use of the seccomp notifier we will register a new seccomp notifier handler in the monitor's event loop. However, when the attaching process exits we can't simply delete the handler since other child processes could've been created (daemons spawned etc.) that have inherited the seccomp filter and so we need to keep the seccomp notifier fd alive in the event loop. But this is problematic since we don't get a notification when the seccomp filter has become unused and so we currently never remove the seccomp notifier fd from the event loop and just keep accumulating fds in the event loop. We've had this issue for a while but it has recently become more pressing as more and larger users make use of this. To fix this, we introduce a new "users" reference counter that tracks any tasks and dependent filters making use of a filter. When a notifier is registered waiting tasks will be notified that the filter is now empty by receiving a (E)POLLHUP event. 
The concept in this patch introduces is the same as for signal_struct, i.e. reference counting for life-cycle management is decoupled from reference counting taks using the object. There's probably some trickery possible but the second counter is just the correct way of doing this IMHO and has precedence. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-3-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 39c1c687d8ad..ba30bffa9301 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,6 +110,14 @@ struct notification { * attached task, once for the dependent filter, and if * requested for the user notifier. When @refs reaches zero, * the filter can be freed. + * @users: A filter's @users count is incremented for each directly + * attached task (filter installation, fork(), thread_sync), + * and once for the dependent filter (tracked in filter->prev). + * When it reaches zero it indicates that no direct or indirect + * users of that filter exist. No new tasks can get associated with + * this filter after reaching 0. The @users count is always smaller + * or equal to @refs. Hence, reaching 0 for @users does not mean + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -129,6 +137,7 @@ struct notification { */ struct seccomp_filter { refcount_t refs; + refcount_t users; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -376,6 +385,15 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } +static void __seccomp_filter_orphan(struct seccomp_filter *orig) +{ + while (orig && refcount_dec_and_test(&orig->users)) { + if (waitqueue_active(&orig->wqh)) + wake_up_poll(&orig->wqh, EPOLLHUP); + orig = orig->prev; + } +} + static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ @@ -386,10 +404,18 @@ static void __put_seccomp_filter(struct seccomp_filter *orig) } } +static void __seccomp_filter_release(struct seccomp_filter *orig) +{ + /* Notify about any unused filters in the task's former filter tree. */ + __seccomp_filter_orphan(orig); + /* Finally drop all references to the task's former tree. */ + __put_seccomp_filter(orig); +} + /** - * seccomp_filter_release - Detach the task from its filter tree - * and drop its reference count during - * exit. + * seccomp_filter_release - Detach the task from its filter tree, + * drop its reference count, and notify + * about unused filters * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and @@ -401,7 +427,7 @@ void seccomp_filter_release(struct task_struct *tsk) /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; - __put_seccomp_filter(orig); + __seccomp_filter_release(orig); } /** @@ -428,12 +454,15 @@ static inline void seccomp_sync_threads(unsigned long flags) /* Get a task reference for the new leaf node. 
*/ get_seccomp_filter(caller); + /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ - __put_seccomp_filter(thread->seccomp.filter); + __seccomp_filter_release(thread->seccomp.filter); + + /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -502,6 +531,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); return sfilter; @@ -606,6 +636,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; __get_seccomp_filter(orig); + refcount_inc(&orig->users); } static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) @@ -1234,6 +1265,9 @@ static __poll_t seccomp_notify_poll(struct file *file, mutex_unlock(&filter->notify_lock); + if (refcount_read(&filter->users) == 0) + ret |= EPOLLHUP; + return ret; } -- cgit v1.2.3 From e68f9d49dda1744d548426c8b4335a8d693a36d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 22:02:56 -0700 Subject: seccomp: Use pr_fmt Avoid open-coding "seccomp: " prefixes for pr_*() calls. Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ba30bffa9301..5f0e3f3a7a5d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ +#define pr_fmt(fmt) "seccomp: " fmt #include #include @@ -1873,7 +1874,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); -- cgit v1.2.3 From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 15:42:46 -0700 Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong direction flag set. While this isn't a big deal as nothing currently enforces these bits in the kernel, it should be defined correctly. Fix the define and provide support for the old command until it is no longer needed for backward compatibility. Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- kernel/seccomp.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0e3f3a7a5d..0ed57e8c49d0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,14 @@ #include #include +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. 
+ */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: -- cgit v1.2.3 From fe4bfff86ec54773df3db79e8112e3b0f820c799 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Jun 2020 12:20:15 -0700 Subject: seccomp: Use -1 marker for end of mode 1 syscall list The terminator for the mode 1 syscalls list was a 0, but that could be a valid syscall number (e.g. x86_64 __NR_read). By luck, __NR_read was listed first and the loop construct would not test it, so there was no bug. However, this is fragile. Replace the terminator with -1 instead, and make the variable name for mode 1 syscall lists more descriptive. Cc: Andy Lutomirski Cc: Will Drewry Signed-off-by: Kees Cook --- kernel/seccomp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ed57e8c49d0..866a432cd746 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -742,20 +742,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ + -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { - const int *syscall_whitelist = mode1_syscalls; + const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = get_compat_mode1_syscalls(); + allowed_syscalls = get_compat_mode1_syscalls(); #endif do { - if (*syscall_whitelist == this_syscall) + if (*allowed_syscalls == this_syscall) return; - } while (*++syscall_whitelist); + } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); -- cgit v1.2.3 From bc885f1ab6de0d38c6956a71b0126543b64875b0 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Fri, 10 Jul 2020 14:44:23 -0300 Subject: doc:kmsg: explicitly state the return value in case of SEEK_CUR The commit 625d3449788f ("Revert "kernel/printk: add kmsg SEEK_CUR handling"") reverted a change done to the return value in case a SEEK_CUR operation was performed for kmsg buffer based on the fact that different userspace apps were handling the new return value (-ESPIPE) in different ways, breaking them. At the same time -ESPIPE was the wrong decision because kmsg /does support/ seek() but doesn't follow the "normal" behavior userspace is used to. Because of that and also considering the time -EINVAL has been used, it was decided to keep this way to avoid more userspace breakage. This patch adds an official statement to the kmsg documentation pointing to the current return value for SEEK_CUR, -EINVAL, thus userspace libraries and apps can refer to it for a definitive guide on what to expect. 
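An illustrative userspace check of the documented behaviour (hypothetical test code, not part of this patch; opening /dev/kmsg may require CAP_SYSLOG):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0)
		return 1;
	/* Seeking relative to the current record is not supported. */
	if (lseek(fd, 0, SEEK_CUR) == (off_t)-1 && errno == EINVAL)
		printf("SEEK_CUR on /dev/kmsg fails with EINVAL, as documented\n");
	close(fd);
	return 0;
}
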
Signed-off-by: Bruno Meneguele Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200710174423.10480-1-bmeneg@redhat.com --- kernel/printk/printk.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c14835be46c..5f6eca65dd9a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -943,6 +943,14 @@ out: return ret; } +/* + * Be careful when modifying this function!!! + * + * Only few operations are supported because the device works only with the + * entire variable length messages (records). Non-standard values are + * returned in the other cases and has been this way for quite some time. + * User space applications might depend on this behavior. + */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; -- cgit v1.2.3 From c9a0f3b85e09dd16665b639cb884490410619434 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:24 +0200 Subject: bpf: Resolve BTF IDs in vmlinux image Using BTF_ID_LIST macro to define lists for several helpers using BTF arguments. And running resolve_btfids on vmlinux elf object during linking, so the .BTF_ids section gets the IDs resolved. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-5-jolsa@kernel.org --- kernel/bpf/stackmap.c | 5 ++++- kernel/trace/bpf_trace.c | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a6c361ed7937..48d8e739975f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -576,7 +577,9 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, buf, size, flags); } -static int bpf_get_task_stack_btf_ids[5]; +BTF_ID_LIST(bpf_get_task_stack_btf_ids) +BTF_ID(struct, task_struct) + const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e0b7775039ab..e178e8e32b33 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -710,7 +711,9 @@ out: return err; } -static int bpf_seq_printf_btf_ids[5]; +BTF_ID_LIST(bpf_seq_printf_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, @@ -728,7 +731,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -static int bpf_seq_write_btf_ids[5]; +BTF_ID_LIST(bpf_seq_write_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, -- cgit v1.2.3 From 138b9a0511c789f2451ff1d80e7fd3f9eef3a9e3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:25 +0200 Subject: bpf: Remove btf_id helpers resolving Now when we moved the helpers btf_id arrays into .BTF_ids section, we can remove the code that resolve those IDs in runtime. 
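For reference, the pattern those helpers now rely on, introduced in the patch just above, looks like this (annotated copy of one of the hunks; the btf_ids.h include is the header providing the macros):

#include <linux/btf_ids.h>

/*
 * Emits an array of BTF IDs into the .BTF_ids section.  Each BTF_ID()
 * entry is a placeholder that the resolve_btfids tool fills in when
 * vmlinux is linked, so no lookup by name is needed at runtime.
 */
BTF_ID_LIST(bpf_get_task_stack_btf_ids)
BTF_ID(struct, task_struct)

/* The helper's bpf_func_proto then refers to this pre-resolved array. */
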
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-6-jolsa@kernel.org --- kernel/bpf/btf.c | 89 ++++---------------------------------------------------- 1 file changed, 5 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4c3007f428b1..71140b73ae3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4079,96 +4079,17 @@ error: return -EINVAL; } -static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, - int arg) -{ - char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; - const struct btf_param *args; - const struct btf_type *t; - const char *tname, *sym; - u32 btf_id, i; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return -EINVAL; - } - - sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); - if (!sym) { - bpf_log(log, "kernel doesn't have kallsyms\n"); - return -EFAULT; - } - - for (i = 1; i <= btf_vmlinux->nr_types; i++) { - t = btf_type_by_id(btf_vmlinux, i); - if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) - continue; - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (!strcmp(tname, fnname)) - break; - } - if (i > btf_vmlinux->nr_types) { - bpf_log(log, "helper %s type is not found\n", fnname); - return -ENOENT; - } - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return -EFAULT; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return -EFAULT; - - args = (const struct btf_param *)(t + 1); - if (arg >= btf_type_vlen(t)) { - bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", - fnname, arg); - return -EINVAL; - } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); - if (!btf_type_is_ptr(t) || !t->type) { - /* anything but the pointer to struct is a helper config bug */ - bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); - return -EFAULT; - } - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - /* skip modifiers */ - while (btf_type_is_modifier(t)) { - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - } - if (!btf_type_is_struct(t)) { - bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); - return -EFAULT; - } - bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, - arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); - return btf_id; -} - int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { - int *btf_id = &fn->btf_id[arg]; - int ret; + int id; if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) return -EINVAL; - - ret = READ_ONCE(*btf_id); - if (ret) - return ret; - /* ok to race the search. The result is the same */ - ret = __btf_resolve_helper_id(log, fn->func, arg); - if (!ret) { - /* Function argument cannot be type 'void' */ - bpf_log(log, "BTF resolution bug\n"); - return -EFAULT; - } - WRITE_ONCE(*btf_id, ret); - return ret; + id = fn->btf_id[arg]; + if (!id || id > btf_vmlinux->nr_types) + return -EINVAL; + return id; } static int __get_type_size(struct btf *btf, u32 btf_id, -- cgit v1.2.3 From 49f4e6720748c778795df62d222d0200b61345be Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:26 +0200 Subject: bpf: Use BTF_ID to resolve bpf_ctx_convert struct This way the ID is resolved during compile time, and we can remove the runtime name search. 
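Condensed, the change amounts to the following; both forms appear in the hunk below:

/* Before: look the struct up by name every time vmlinux BTF is parsed. */
btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT);

/* After: the ID is filled into a .BTF_ids array when vmlinux is linked. */
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)

bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
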
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-7-jolsa@kernel.org --- kernel/bpf/btf.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 71140b73ae3c..a710e3ee1f18 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3621,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +BTF_ID_LIST(bpf_ctx_convert_btf_id) +BTF_ID(struct, bpf_ctx_convert) + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, btf_id; + int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3659,14 +3663,8 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; - /* find struct bpf_ctx_convert for type checking later */ - btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT); - if (btf_id < 0) { - err = btf_id; - goto errout; - } /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); + bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); /* find bpf map structs for map_ptr access checking */ err = btf_vmlinux_map_ids_init(btf, log); -- cgit v1.2.3 From 4969f8a073977123504609d7310b42a588297aa4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Add missing sock updates for pidfd_getfd() The sock counting (sock_update_netprioidx() and sock_update_classid()) was missing from pidfd's implementation of received fd installation. Add a call to the new __receive_sock() helper. Cc: Christian Brauner Cc: Christoph Hellwig Cc: Sargun Dhillon Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 8649c322f75c ("pid: Implement pidfd_getfd syscall") Signed-off-by: Kees Cook --- kernel/pid.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..ee58530d1aca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -642,10 +643,12 @@ static int pidfd_getfd(struct pid *pid, int fd) } ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) + if (ret < 0) { fput(file); - else + } else { + __receive_sock(file); fd_install(ret, file); + } return ret; } -- cgit v1.2.3 From 910d2f16ac90463a1f5b03d53246c443e2b354b9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Replace open-coded receive_fd() Replace the open-coded version of receive_fd() with a call to the new helper. Thanks to Vamshi K Sthambamkadi for catching a missed fput() in an earlier version of this patch. 
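Based on the open-coded sequence being replaced (see the previous patch and the hunk below), receive_fd() is assumed to bundle roughly the following steps; this is a hedged sketch, not the helper's actual implementation, which is added elsewhere in this series:

/* Sketch only: assumed shape of receive_fd(file, O_CLOEXEC). */
static int receive_fd_sketch(struct file *file, unsigned int o_flags)
{
	int error, new_fd;

	error = security_file_receive(file);
	if (error)
		return error;

	new_fd = get_unused_fd_flags(o_flags);
	if (new_fd < 0)
		return new_fd;

	__receive_sock(file);		/* sock accounting, as added above */
	get_file(file);			/* inferred: the caller still does fput() */
	fd_install(new_fd, file);
	return new_fd;
}
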
Cc: Christoph Hellwig Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/pid.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ee58530d1aca..da5aea5f04fa 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -636,19 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - fput(file); - } else { - __receive_sock(file); - fd_install(ret, file); - } + ret = receive_fd(file, O_CLOEXEC); + fput(file); return ret; } -- cgit v1.2.3 From ac5a72ea5c8989871e61f6bb0852e0f91de51ebe Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 13 Jul 2020 12:52:33 +0100 Subject: bpf: Use dedicated bpf_trace_printk event instead of trace_printk() The bpf helper bpf_trace_printk() uses trace_printk() under the hood. This leads to an alarming warning message originating from trace buffer allocation which occurs the first time a program using bpf_trace_printk() is loaded. We can instead create a trace event for bpf_trace_printk() and enable it in-kernel when/if we encounter a program using the bpf_trace_printk() helper. With this approach, trace_printk() is not used directly and no warning message appears. This work was started by Steven (see Link) and finished by Alan; added Steven's Signed-off-by with his permission. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20200628194334.6238b933@oasis.local.home Link: https://lore.kernel.org/bpf/1594641154-18897-2-git-send-email-alan.maguire@oracle.com --- kernel/trace/Makefile | 2 ++ kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++----- kernel/trace/bpf_trace.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 kernel/trace/bpf_trace.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6575bb0a0434..aeba5ee7325a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +CFLAGS_bpf_trace.o := -I$(src) + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e178e8e32b33..3cc0dcb60ca2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,9 @@ #include "trace_probe.h" #include "trace.h" +#define CREATE_TRACE_POINTS +#include "bpf_trace.h" + #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) @@ -375,6 +379,30 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, } } +static DEFINE_RAW_SPINLOCK(trace_printk_lock); + +#define BPF_TRACE_PRINTK_SIZE 1024 + +static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) 
+{ + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + va_list ap; + int ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); + va_start(ap, fmt); + ret = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + /* vsnprintf() will not append null for zero-length strings */ + if (ret == 0) + buf[0] = '\0'; + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + + return ret; +} + /* * Only limited trace_printk() conversion specifiers allowed: * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s @@ -484,8 +512,7 @@ fmt_next: */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(0 /* Fake ip */, \ - fmt, ##__VA_ARGS__) + bpf_do_trace_printk(fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ @@ -522,10 +549,15 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers + * This program might be calling bpf_trace_printk, + * so enable the associated bpf_trace/bpf_trace_printk event. + * Repeat this each time as it is possible a user has + * disabled bpf_trace_printk events. By loading a program + * calling bpf_trace_printk() however the user has expressed + * the intent to see such events. */ - trace_printk_init_buffers(); + if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) + pr_warn_ratelimited("could not enable bpf_trace_printk events"); return &bpf_trace_printk_proto; } diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h new file mode 100644 index 000000000000..9acbc11ac7bb --- /dev/null +++ b/kernel/trace/bpf_trace.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_trace + +#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_BPF_TRACE_H + +#include + +TRACE_EVENT(bpf_trace_printk, + + TP_PROTO(const char *bpf_string), + + TP_ARGS(bpf_string), + + TP_STRUCT__entry( + __string(bpf_string, bpf_string) + ), + + TP_fast_assign( + __assign_str(bpf_string, bpf_string); + ), + + TP_printk("%s", __get_str(bpf_string)) +); + +#endif /* _TRACE_BPF_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_trace + +#include -- cgit v1.2.3 From 567f6a6eba0c09e5f502e0290e57651befa8aacb Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:25 +0200 Subject: dma-direct: provide function to check physical memory area validity dma_coherent_ok() checks if a physical memory area fits a device's DMA constraints. 
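With the symbol exported to the rest of the DMA code, callers outside dma-direct can validate candidate memory, for example (hypothetical helper name, sketch only):

/* Can this already-allocated page be handed to 'dev' as coherent memory? */
static bool page_suits_device(struct device *dev, struct page *page,
			      size_t size)
{
	return dma_coherent_ok(dev, page_to_phys(page), size);
}
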
Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 95866b647581..67f060b86a73 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -70,7 +70,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); -- cgit v1.2.3 From 23e469be6239d9cf3d921fc3e38545491df56534 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:26 +0200 Subject: dma-pool: get rid of dma_in_atomic_pool() The function is only used once and can be simplified to a one-liner. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 39ca26fa41b5..318035e093fb 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -217,15 +217,6 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } -static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) -{ - struct gen_pool *pool = dev_to_pool(dev); - - if (unlikely(!pool)) - return false; - return gen_pool_has_addr(pool, (unsigned long)start, size); -} - void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { @@ -260,7 +251,7 @@ bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = dev_to_pool(dev); - if (!dma_in_atomic_pool(dev, start, size)) + if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) return false; gen_pool_free(pool, (unsigned long)start, size); return true; -- cgit v1.2.3 From 48b6703858dd5526c82d8ff2dbac59acab3a9dda Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:27 +0200 Subject: dma-pool: introduce dma_guess_pool() dma-pool's dev_to_pool() creates the false impression that there is a way to grantee a mapping between a device's DMA constraints and an atomic pool. It tuns out it's just a guess, and the device might need to use an atomic pool containing memory from a 'safer' (or lower) memory zone. To help mitigate this, introduce dma_guess_pool() which can be fed a device's DMA constraints and atomic pools already known to be faulty, in order for it to provide an better guess on which pool to use. 
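The intended calling pattern, condensed from the dma_alloc_from_pool() rework two patches further below (hypothetical function name, error reporting trimmed):

static void *alloc_from_suitable_pool(struct device *dev, size_t size)
{
	struct gen_pool *pool = NULL;
	unsigned long val;

	for (;;) {
		/* NULL on the first pass: start from the best guess for the
		 * device, then walk toward safer zones (kernel -> DMA32 -> DMA). */
		pool = dma_guess_pool(dev, pool);
		if (!pool)
			return NULL;

		val = gen_pool_alloc(pool, size);
		if (!val)
			continue;

		if (dma_coherent_ok(dev, gen_pool_virt_to_phys(pool, val), size))
			return (void *)val;

		/* Memory came from an unsuitable zone: give it back and retry. */
		gen_pool_free(pool, val, size);
	}
}
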
Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 318035e093fb..5b9eaa2b498d 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dev_to_pool(struct device *dev) +static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) { u64 phys_mask; gfp_t gfp; @@ -217,10 +217,30 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? : atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) +{ + if (bad_pool) + return dma_get_safer_pool(bad_pool); + + return dma_guess_pool_from_device(dev); +} + void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = dma_guess_pool(dev, NULL); unsigned long val; void *ptr = NULL; @@ -249,7 +269,7 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = dma_guess_pool(dev, NULL); if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) return false; -- cgit v1.2.3 From 81e9d894e03f9a279102c7aac62ea7cbf9949f4b Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 14 Jul 2020 14:39:28 +0200 Subject: dma-pool: make sure atomic pool suits device When allocating DMA memory from a pool, the core can only guess which atomic pool will fit a device's constraints. If it doesn't, get a safer atomic pool and try again. 
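In essence, the retry loop added below boils down to the following (simplified sketch; the pool-exhaustion warning and bookkeeping are omitted):

	while ((pool = dma_guess_pool(dev, pool))) {
		val = gen_pool_alloc(pool, size);
		if (!val)
			continue;				/* pool empty: ask for the next, safer pool */
		if (dma_coherent_ok(dev, gen_pool_virt_to_phys(pool, val), size))
			break;					/* memory suits the device */
		gen_pool_free(pool, val, size);			/* wrong zone: put it back and retry */
		val = 0;
	}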
Fixes: c84dc6e68a1d ("dma-pool: add additional coherent pools to map to gfp mask") Reported-by: Jeremy Linton Suggested-by: Robin Murphy Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 57 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5b9eaa2b498d..d48d9acb585f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -240,39 +240,56 @@ static inline struct gen_pool *dma_guess_pool(struct device *dev, void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dma_guess_pool(dev, NULL); - unsigned long val; + struct gen_pool *pool = NULL; + unsigned long val = 0; void *ptr = NULL; - - if (!pool) { - WARN(1, "%pGg atomic pool not initialised!\n", &flags); - return NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; } - val = gen_pool_alloc(pool, size); - if (likely(val)) { - phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + if (val) { *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); - } else { - WARN_ONCE(1, "DMA coherent pool depleted, increase size " - "(recommended min coherent_pool=%zuK)\n", - gen_pool_size(pool) >> 9); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); } - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); return ptr; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dma_guess_pool(dev, NULL); + struct gen_pool *pool = NULL; - if (!pool || !gen_pool_has_addr(pool, (unsigned long)start, size)) - return false; - gen_pool_free(pool, (unsigned long)start, size); - return true; + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + return false; + + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + } } -- cgit v1.2.3 From d9765e41d8e9ea2251bf73735a2895c8bad546fc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 8 Jul 2020 18:49:39 +0200 Subject: dma-pool: do not allocate pool memory from CMA There is no guarantee to CMA's placement, so allocating a zone specific atomic pool from CMA might return memory from a completely different memory zone. So stop using it. 
Fixes: c84dc6e68a1d ("dma-pool: add additional coherent pools to map to gfp mask") Reported-by: Jeremy Linton Signed-off-by: Nicolas Saenz Julienne Tested-by: Jeremy Linton Acked-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d48d9acb585f..6bc74a2d5127 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -69,12 +68,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, 1 << order, - order, false); - else - page = alloc_pages(gfp, order); + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -118,8 +112,7 @@ remove_mapping: dma_common_free_remap(addr, pool_size); #endif free_page: __maybe_unused - if (!dma_release_from_contiguous(NULL, page, 1 << order)) - __free_pages(page, order); + __free_pages(page, order); out: return ret; } -- cgit v1.2.3 From 6ada7ba2fa7710eead191b69fd948569b05d0f61 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Mon, 22 Jun 2020 17:09:13 +0800 Subject: PM: hibernate: fix white space in a few places In hibernate.c, some places lack of spaces while some places have redundant spaces. So fix them. Signed-off-by: Xiang Chen Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..5714f51ba9f8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1062,7 +1062,7 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } @@ -1162,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, @@ -1190,7 +1190,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strncpy(resume_file, str, 255); return 1; } -- cgit v1.2.3 From 02d7f4005e942f75c63302d54a659e673edd00df Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 10 Jul 2020 22:12:10 +0300 Subject: PM: sleep: spread "const char *" correctness Fixed string literals can be referred to as "const char *". Signed-off-by: Alexey Dobriyan [ rjw: Minor subject edit ] Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index ba2094db6294..32fc89ac96c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info) return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? 
"architecture specific data" : NULL; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 881128b9351e..cef154261fe2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm) static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) -- cgit v1.2.3 From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:43 -0700 Subject: seccomp: Introduce addfd ioctl to seccomp user notifier The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. 
[1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig Cc: Christian Brauner Cc: Tycho Andersen Cc: Jann Horn Cc: Robert Sesek Cc: Chris Palmer Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon Reviewed-by: Will Drewry Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- kernel/seccomp.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 866a432cd746..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); @@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall, /* * This is where we wait for a reply from userspace. 
*/ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. 
+ */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: + mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); @@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } -- cgit v1.2.3 From 5b801dfb7feb2738975d80223efc2fc193e55573 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 14 Jul 2020 14:09:04 -0400 Subject: bpf: Fix NULL pointer dereference in __btf_resolve_helper_id() Prevent __btf_resolve_helper_id() from dereferencing `btf_vmlinux` as NULL. This patch fixes the following syzbot bug: https://syzkaller.appspot.com/bug?id=f823224ada908fa5c207902a5a62065e53ca0fcc Reported-by: syzbot+ee09bda7017345f1fbe6@syzkaller.appspotmail.com Signed-off-by: Peilin Ye Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200714180904.277512-1-yepeilin.cs@gmail.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9a1a98dd9e97..0443600146dc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4058,6 +4058,11 @@ static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, const char *tname, *sym; u32 btf_id, i; + if (!btf_vmlinux) { + bpf_log(log, "btf_vmlinux doesn't exist\n"); + return -EINVAL; + } + if (IS_ERR(btf_vmlinux)) { bpf_log(log, "btf_vmlinux is malformed\n"); return -EINVAL; -- cgit v1.2.3 From d3fa60d7bfdc9a0ff1a524bdef96b3db1fd62022 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:45:11 +0200 Subject: dma-mapping: move the remaining DMA API calls out of line For a long time the DMA API has been implemented inline in dma-mapping.h, but the function bodies can be quite large. Move them all out of line. This also removes all the dma_direct_* exports as those are just implementation details and should never be used by drivers directly. 
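For context, the wrappers being moved had roughly the following shape as static inlines in include/linux/dma-mapping.h (a sketch of the old header, not a verbatim quote); the hunks below re-add the same bodies as exported functions in kernel/dma/mapping.c:

	static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
						   size_t size, enum dma_data_direction dir)
	{
		const struct dma_map_ops *ops = get_dma_ops(dev);

		if (dma_is_direct(ops))
			dma_direct_sync_single_for_cpu(dev, addr, size, dir);
		else if (ops->sync_single_for_cpu)
			ops->sync_single_for_cpu(dev, addr, size, dir);
		debug_dma_sync_single_for_cpu(dev, addr, size, dir);
	}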
Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- kernel/dma/direct.c | 9 --- kernel/dma/mapping.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 67f060b86a73..e545d14fccee 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -315,7 +315,6 @@ void dma_direct_sync_single_for_device(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); } -EXPORT_SYMBOL(dma_direct_sync_single_for_device); void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -335,7 +334,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ @@ -354,7 +352,6 @@ void dma_direct_sync_single_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -376,7 +373,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -389,7 +385,6 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(is_swiotlb_buffer(phys))) swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -401,7 +396,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -428,7 +422,6 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, arch_sync_dma_for_device(phys, size, dir); return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_page); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -450,7 +443,6 @@ out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -467,7 +459,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a8c18c9a796f..b53953024512 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,6 +105,170 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static inline bool dma_is_direct(const struct dma_map_ops *ops) +{ + return likely(!ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page 
*page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_is_direct(ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_is_direct(ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_is_direct(ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} 
+EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ -- cgit v1.2.3 From b4174173005972f8f6497883d08d87e0aba1b604 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:47:08 +0200 Subject: dma-mapping: inline the fast path dma-direct calls Inline the single page map/unmap/sync dma-direct calls into the now out of line generic wrappers. This restores the behavior of a single function call that we had before moving the generic calls out of line. Besides the dma-mapping callers there are just a few callers in IOMMU drivers that have a bypass mode, and more of those are going to be switched to the generic bypass soon. 
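The deletions from kernel/dma/direct.c below show only half of the picture; the removed single-page helpers are expected to reappear as static inlines in a private header (a sketch under that assumption, mirroring the removed body) so that the out-of-line wrapper compiles down to a single call again:

	/* e.g. in kernel/dma/direct.h, roughly: */
	static inline void dma_direct_sync_single_for_device(struct device *dev,
			dma_addr_t addr, size_t size, enum dma_data_direction dir)
	{
		phys_addr_t paddr = dma_to_phys(dev, addr);

		if (unlikely(is_swiotlb_buffer(paddr)))
			swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
		if (!dev_is_dma_coherent(dev))
			arch_sync_dma_for_device(paddr, size, dir);
	}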
Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- kernel/dma/direct.c | 65 ----------------------------------------------------- 1 file changed, 65 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e545d14fccee..bb0041e99659 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,11 +10,9 @@ #include #include #include -#include #include #include #include -#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it @@ -304,18 +302,6 @@ void dma_direct_free(struct device *dev, size_t size, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); - - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(paddr, size, dir); -} - void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -339,20 +325,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(paddr, size, dir); - arch_sync_dma_for_cpu_all(); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -374,18 +346,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); -} - void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -398,31 +358,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, } #endif -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) - return swiotlb_map(dev, phys, size, dir, attrs); - - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(phys, size, dir); - return dma_addr; -} - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, 
enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3 From 9b74ebb2b0f259474da65fa0178c657e5fa5c640 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 14 Jul 2020 15:56:34 +0200 Subject: cpumap: Use non-locked version __ptr_ring_consume_batched Commit 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched") changed away from single-frame ptr_ring dequeue (__ptr_ring_consume) to batched consumption, but it uses a locked version, which, as the comment explains, isn't needed. Change to use the non-locked version __ptr_ring_consume_batched. Fixes: 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/a9c7d06f9a009e282209f0c8c7b2c5d9b9ad60b9.1594734381.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bd8658055c16..323c91c4fab0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -259,7 +259,7 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; -- cgit v1.2.3 From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As has already been done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct.
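For reference, the new UAPI value looks roughly like this (a sketch inferred from how the cpumap code below uses it; see include/uapi/linux/bpf.h for the authoritative definition):

	struct bpf_cpumap_val {
		__u32 qsize;		/* queue size to remote target CPU */
		union {
			int   fd;	/* program fd on map write */
			__u32 id;	/* program id on map read */
		} bpf_prog;
	};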
Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 323c91c4fab0..ff48dc00e8d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,13 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? 
&rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -- cgit v1.2.3 From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + 
switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int 
map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); -- cgit v1.2.3 From 28b1520ebf81ced970141640d90279ac7b9f1f9a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:39 +0200 Subject: bpf: cpumap: Implement XDP_REDIRECT for eBPF programs attached to map entries Introduce XDP_REDIRECT support for eBPF programs attached to cpumap entries. This patch has been tested on Marvell ESPRESSObin using a modified version of xdp_redirect_cpu sample in order to attach a XDP program to CPUMAP entries to perform a redirect on the mvneta interface. In particular the following scenario has been tested: rq (cpu0) --> mvneta - XDP_REDIRECT (cpu0) --> CPUMAP - XDP_REDIRECT (cpu1) --> mvneta $./xdp_redirect_cpu -p xdp_cpu_map0 -d eth0 -c 1 -e xdp_redirect \ -f xdp_redirect_kern.o -m tx_port -r eth0 tx: 285.2 Kpps rx: 285.2 Kpps Attaching a simple XDP program on eth0 to perform XDP_TX gives comparable results: tx: 288.4 Kpps rx: 288.4 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/2cf8373a731867af302b00c4ff16c122630c4980.1594734381.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3a8aea81ee5..4c95d0615ca2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -239,7 +239,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, if (!rcpu->prog) return n; - rcu_read_lock(); + rcu_read_lock_bh(); xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; @@ -267,6 +267,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->pass++; } break; + case XDP_REDIRECT: + err = xdp_do_redirect(xdpf->dev_rx, &xdp, + rcpu->prog); + if (unlikely(err)) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + stats->redirect++; + } + break; default: bpf_warn_invalid_xdp_action(act); /* fallthrough */ @@ -277,9 +287,12 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } + if (stats->redirect) + xdp_do_flush_map(); + xdp_clear_return_frame_no_direct(); - rcu_read_unlock(); + rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } -- cgit v1.2.3 From 3f649ab728cda8038259d8f14492fe400fbab911 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jun 2020 13:09:38 -0700 Subject: treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. 
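The effect of the script on a typical declaration is the following (illustrative example, not taken from a specific file):

	- int uninitialized_var(ret);
	+ int ret;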
No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe # IB Acked-by: Kalle Valo # wireless drivers Reviewed-by: Chao Yu # erofs Signed-off-by: Kees Cook --- kernel/async.c | 4 ++-- kernel/audit.c | 2 +- kernel/debug/kdb/kdb_io.c | 2 +- kernel/dma/debug.c | 2 +- kernel/events/core.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/exit.c | 2 +- kernel/futex.c | 14 +++++++------- kernel/locking/lockdep.c | 16 ++++++++-------- kernel/trace/ring_buffer.c | 2 +- 10 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 4f9c1d614016..33258e6e20f8 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t uninitialized_var(calltime), delta, rettime; + ktime_t calltime, delta, rettime; /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t uninitialized_var(starttime), delta, endtime; + ktime_t starttime, delta, endtime; if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/audit.c b/kernel/audit.c index 8c201f414226..ec38479f9228 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1800,7 +1800,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, { struct audit_buffer *ab; struct timespec64 t; - unsigned int uninitialized_var(serial); + unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 683a799618ad..9d847ab851db 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - unsigned long uninitialized_var(flags); + unsigned long flags; /* Serialize kdb_printf if multiple cpus try to write at once. 
* But if any cpu goes recursive in kdb, just print the output, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..d628ab09d97b 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -882,7 +882,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); + struct dma_debug_entry *entry; int count; if (dma_debug_disabled()) diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..851fc5e0e24b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11483,7 +11483,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *uninitialized_var(gctx); + struct perf_event_context *ctx, *gctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0862873dba..e84eb52b646b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == get_trampoline_vaddr()) diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..7bcd571618dd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..05e88562de68 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1326,7 +1326,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; - u32 uninitialized_var(curval); + u32 curval; if (unlikely(should_fail_futex(true))) return -EFAULT; @@ -1496,7 +1496,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 uninitialized_var(curval), newval; + u32 curval, newval; struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); @@ -2370,7 +2370,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; + u32 uval, curval, newval; struct task_struct *oldowner, *newowner; u32 newtid; int ret, err = 0; @@ -2996,7 +2996,7 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; struct futex_q *top_waiter; @@ -3479,7 +3479,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { - u32 uval, uninitialized_var(nval), mval; + u32 uval, nval, mval; int err; /* Futex address must be 32bit aligned */ 
@@ -3609,7 +3609,7 @@ static void exit_robust_list(struct task_struct *curr) struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; unsigned long futex_offset; int rc; @@ -3909,7 +3909,7 @@ static void compat_exit_robust_list(struct task_struct *curr) struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..84ed1d1d5013 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data) static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_forwards(this, (void *)&count, noop_count, &target_entry); @@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_backwards(this, (void *)&count, noop_count, &target_entry); @@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -1842,7 +1842,7 @@ static noinline int check_redundant(struct held_lock *src, struct held_lock *target) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, { unsigned long usage_mask = 0, forward_mask, backward_mask; enum lock_usage_bit forward_bit = 0, backward_bit = 0; - struct lock_list *uninitialized_var(target_entry1); - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry1; + struct lock_list *target_entry; struct lock_list this, that; int ret; @@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); @@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00867ff82412..f15471ce969e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -577,7 +577,7 @@ static void rb_wake_up_waiters(struct irq_work *work) */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { - struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); + struct ring_buffer_per_cpu 
*cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; int ret = 0; -- cgit v1.2.3 From 01cfcde9c26d8555f0e6e9aea9d6049f87683998 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 10 Jul 2020 17:24:26 +0200 Subject: sched/fair: handle case of task_h_load() returning 0 task_h_load() can return 0 in some situations like running stress-ng mmapfork, which forks thousands of threads, in a sched group on a 224 cores system. The load balance doesn't handle this correctly because env->imbalance never decreases and it will stop pulling tasks only after reaching loop_max, which can be equal to the number of running tasks of the cfs. Make sure that imbalance will be decreased by at least 1. misfit task is the other feature that doesn't handle correctly such situation although it's probably more difficult to face the problem because of the smaller number of CPUs and running tasks on heterogenous system. We can't simply ensure that task_h_load() returns at least one because it would imply to handle underflow in other places. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Cc: # v4.4+ Link: https://lkml.kernel.org/r/20200710152426.16981-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 658aa7a2ae6f..04fa8dbcfa4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) -- cgit v1.2.3 From e2a71bdea81690b6ef11f4368261ec6f5b6891aa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:40 +0200 Subject: timer: Fix wheel index calculation on last level When an expiration delta falls into the last level of the wheel, that delta has be compared against the maximum possible delay and reduced to fit in if necessary. However instead of comparing the delta against the maximum, the code compares the actual expiry against the maximum. Then instead of fixing the delta to fit in, it sets the maximum delta as the expiry value. This can result in various undesired outcomes, the worst possible one being a timer expiring 15 days ahead to fire immediately. 
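For illustration, here is a standalone userspace sketch of the corrected rule: the delta relative to the wheel clock, not the absolute expiry, is compared against the capacity, and an oversized delta is clamped relative to clk. MAX_DELTA is a simplified stand-in for the kernel's WHEEL_TIMEOUT_CUTOFF/WHEEL_TIMEOUT_MAX pair; the real change is the two-line hunk further below.

#include <stdio.h>

#define MAX_DELTA	(1UL << 22)	/* stand-in for the wheel capacity */

static unsigned long clamp_expiry(unsigned long expires, unsigned long clk)
{
	unsigned long delta = expires - clk;

	/* Compare the delta, not the absolute expiry, against the capacity. */
	if (delta >= MAX_DELTA)
		expires = clk + MAX_DELTA;	/* clamp relative to clk */
	return expires;
}

int main(void)
{
	/*
	 * Small 1000-jiffy timeout while clk is already large: nothing is
	 * clamped. The old check (absolute expiry >= cutoff) would trigger
	 * here and rewrite the expiry to a constant that lies in the past,
	 * which is how a far-future timer could end up firing immediately.
	 */
	printf("%lu\n", clamp_expiry(6000000UL, 5999000UL));
	/* Obscene timeout: reduced to clk + MAX_DELTA, never below clk. */
	printf("%lu\n", clamp_expiry(5999000UL + (1UL << 30), 5999000UL));
	return 0;
}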
Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200717140551.29076-2-frederic@kernel.org --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9a838d38dbe6..df1ff803acc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -521,8 +521,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ - if (expires >= WHEEL_TIMEOUT_CUTOFF) - expires = WHEEL_TIMEOUT_MAX; + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1); } -- cgit v1.2.3 From 3d2e83a2a6a0657c1cf145fa6ba23620715d6c36 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:41 +0200 Subject: timers: Preserve higher bits of expiration on index calculation The higher bits of the timer expiration are cropped while calling calc_index() due to the implicit cast from unsigned long to unsigned int. This loss shouldn't have consequences on the current code since all the computation to calculate the index is done on the lower 32 bits. However to prepare for returning the actual bucket expiration from calc_index() in order to properly fix base->next_expiry updates, the higher bits need to be preserved. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-3-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df1ff803acc4..bcdc3045138d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,7 +487,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 1f32cab0db4bdf6491eb4a60838f278e01c31698 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 17 Jul 2020 16:05:42 +0200 Subject: timers: Use only bucket expiry for base->next_expiry value The bucket expiry time is the effective expriy time of timers and is greater than or equal to the requested timer expiry time. This is due to the guarantee that timers never expire early and the reduced expiry granularity in the secondary wheel levels. When a timer is enqueued, trigger_dyntick_cpu() checks whether the timer is the new first timer. This check compares next_expiry with the requested timer expiry value and not with the effective expiry value of the bucket into which the timer was queued. Storing the requested timer expiry value in base->next_expiry can lead to base->clk going backwards if the requested timer expiry value is smaller than base->clk. Commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") worked around this by preventing the store when timer->expiry is before base->clk, but did not fix the underlying problem. Use the expiry value of the bucket into which the timer is queued to do the new first timer check. This fixes the base->clk going backward problem. 
The workaround of commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") in trigger_dyntick_cpu() is not longer necessary as the timers bucket expiry is guaranteed to be greater than or equal base->clk. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-4-frederic@kernel.org --- kernel/time/timer.c | 64 ++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bcdc3045138d..a7a3cf737411 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,35 +487,39 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned long expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the @@ -524,7 +528,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } @@ -544,16 +548,18 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, } static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer, + unsigned long *bucket_expiry) { unsigned int idx; - idx = calc_wheel_index(timer->expires, base->clk); + idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); enqueue_timer(base, timer, idx); } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, + unsigned long bucket_expiry) { if (!is_timers_nohz_active()) return; @@ -576,31 +582,29 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list 
*timer) if (!base->is_idle) return; - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_after_eq(bucket_expiry, base->next_expiry)) return; /* * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - if (time_before(timer->expires, base->clk)) { - /* - * Prevent from forward_timer_base() moving the base->clk - * backward - */ - base->next_expiry = base->clk; - } else { - base->next_expiry = timer->expires; - } + base->next_expiry = bucket_expiry; wake_up_nohz_cpu(base->cpu); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + + __internal_add_timer(base, timer, &bucket_expiry); + trigger_dyntick_cpu(base, timer, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -959,9 +963,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -1000,7 +1004,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1059,7 +1063,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); + trigger_dyntick_cpu(base, timer, bucket_expiry); } else { internal_add_timer(base, timer); } -- cgit v1.2.3 From 9a2b764b06c880678416d803d027f575ae40ec99 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:43 +0200 Subject: timers: Move trigger_dyntick_cpu() to enqueue_timer() Consolidate the code by calling trigger_dyntick_cpu() from enqueue_timer() instead of calling it from all its callers. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-5-frederic@kernel.org --- kernel/time/timer.c | 61 ++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a7a3cf737411..2af08a169564 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -533,30 +533,6 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. 
- */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer, - unsigned long *bucket_expiry) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, unsigned long bucket_expiry) @@ -598,15 +574,31 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, wake_up_nohz_cpu(base->cpu); } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) { - unsigned long bucket_expiry; + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); - __internal_add_timer(base, timer, &bucket_expiry); + trace_timer_start(timer, timer->expires, timer->flags); trigger_dyntick_cpu(base, timer, bucket_expiry); } +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); +} + #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -1057,16 +1049,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer, bucket_expiry); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 4468897211628865ee2392acb5ad281f74176f63 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:44 +0200 Subject: timers: Add comments about calc_index() ceiling work calc_index() adds 1 unit of the level granularity to the expiry passed in parameter to ensure that the timer doesn't expire too early. Add a comment to explain that and the resulting layout in the wheel. 
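For illustration, the ceiling can be checked in isolation: rounding the expiry up by one unit of the level granularity before shifting guarantees that the bucket expiry is greater than or equal to the requested expiry, so the timer may fire up to one granularity unit late but never early. LVL_SHIFT below is a fixed stand-in value; in the kernel the granularity depends on the level and HZ.

#include <assert.h>
#include <stdio.h>

#define LVL_SHIFT	3			/* stand-in: granularity of 8 jiffies */
#define LVL_GRAN	(1UL << LVL_SHIFT)

int main(void)
{
	for (unsigned long expires = 1000; expires < 1010; expires++) {
		/* Same rounding as calc_index(): ceil to the next bucket. */
		unsigned long bucket = (expires + LVL_GRAN) >> LVL_SHIFT;
		unsigned long bucket_expiry = bucket << LVL_SHIFT;

		assert(bucket_expiry >= expires);		/* never early */
		assert(bucket_expiry - expires <= LVL_GRAN);	/* bounded lateness */
		printf("expires=%lu -> bucket_expiry=%lu\n",
		       expires, bucket_expiry);
	}
	return 0;
}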
Suggested-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-6-frederic@kernel.org --- kernel/time/timer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2af08a169564..af1c08b0b168 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -156,7 +156,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -490,6 +491,15 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 001ec1b3925da0d51847c23fc0aa4129282db526 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:45 +0200 Subject: timers: Optimize _next_timer_interrupt() level iteration If a level has a timer that expires before reaching the next level, there is no need to iterate further. The next level is reached when the 3 lower bits of the current level are cleared. If the next event happens before/during that, the next levels won't provide an earlier expiration. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-7-frederic@kernel.org --- kernel/time/timer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af1c08b0b168..9abc41715fd2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1526,6 +1526,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1533,6 +1534,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. If the current level clock lower @@ -1570,7 +1578,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 
1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } -- cgit v1.2.3 From dc2a0f1fb2a06df09f5094f29aea56b763aa7cca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:46 +0200 Subject: timers: Always keep track of next expiry So far next expiry was only tracked while the CPU was in nohz_idle mode in order to cope with missing ticks that can't increment the base->clk periodically anymore. This logic is going to be expanded beyond nohz in order to spare timer softirqs so do it unconditionally. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-8-frederic@kernel.org --- kernel/time/timer.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9abc41715fd2..76fd9644638b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -544,8 +544,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, - unsigned long bucket_expiry) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { if (!is_timers_nohz_active()) return; @@ -565,23 +564,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; - - /* - * Check whether this is the new first expiring timer. The - * effective expiry time of the timer is required here - * (bucket_expiry) instead of timer->expires. - */ - if (time_after_eq(bucket_expiry, base->next_expiry)) - return; - - /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: - */ - base->next_expiry = bucket_expiry; - wake_up_nohz_cpu(base->cpu); + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); } /* @@ -592,12 +576,26 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, timer->expires, timer->flags); - trigger_dyntick_cpu(base, timer, bucket_expiry); + + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_before(bucket_expiry, base->next_expiry)) { + /* + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: + */ + base->next_expiry = bucket_expiry; + trigger_dyntick_cpu(base, timer); + } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) @@ -1493,7 +1491,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. 
Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1585,6 +1582,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1790,6 +1788,7 @@ static inline void __run_timers(struct timer_base *base) levels = collect_expired_timers(base, heads); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -2042,6 +2041,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } -- cgit v1.2.3 From 90d52f65f303091be17b5f4ffab7090b2064b4a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:47 +0200 Subject: timers: Reuse next expiry cache after nohz exit Now that the next expiry it tracked unconditionally when a timer is added, this information can be reused on a tick firing after exiting nohz. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-9-frederic@kernel.org --- kernel/time/timer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 76fd9644638b..13f48ee708aa 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1706,13 +1706,11 @@ static int collect_expired_timers(struct timer_base *base, * the next expiring timer. */ if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, now)) { + if (time_after(base->next_expiry, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. @@ -1720,7 +1718,7 @@ static int collect_expired_timers(struct timer_base *base, base->clk = now; return 0; } - base->clk = next; + base->clk = base->next_expiry; } return __collect_expired_timers(base, heads); } -- cgit v1.2.3 From 1f8a4212dc83f8353843fabf6465fd918372fbbf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:48 +0200 Subject: timers: Expand clk forward logic beyond nohz As for next_expiry, the base->clk catch up logic will be expanded beyond NOHZ in order to avoid triggering useless softirqs. If softirqs should only fire to expire pending timers, periodic base->clk increments must be skippable for random amounts of time. Therefore prepare to catch-up with missing updates whenever an up-to-date base clock is needed. 
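For illustration, the catch-up rule reads as follows when condensed into a single standalone function; the names are stand-ins, and jiffies wrap-around handling as well as locking are deliberately left out of this sketch.

#include <stdio.h>

static unsigned long forward_clk(unsigned long clk, unsigned long next_expiry,
				 unsigned long now)
{
	if (now <= clk)			/* base clock is already up to date */
		return clk;
	if (next_expiry > now)		/* next timer still lies in the future */
		return now;
	return next_expiry;		/* stop at the cached next expiry */
}

int main(void)
{
	/* Woken up at jiffies=2000 with clk=1000 and a timer due at 1500:
	 * the clock stops at 1500 so that bucket is still processed. */
	printf("%lu\n", forward_clk(1000, 1500, 2000));
	/* Next timer only due at 3000: catch up straight to jiffies. */
	printf("%lu\n", forward_clk(1000, 3000, 2000));
	return 0;
}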
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-10-frederic@kernel.org --- kernel/time/timer.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 13f48ee708aa..1be92b53b75f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -888,19 +888,12 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; - /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. - */ - if (likely(!base->must_forward_clk)) + if (!base->must_forward_clk) return; jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; if ((long)(jnow - base->clk) < 2) return; @@ -915,7 +908,6 @@ static inline void forward_timer_base(struct timer_base *base) return; base->clk = base->next_expiry; } -#endif } @@ -1667,10 +1659,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1769,16 +1759,7 @@ static inline void __run_timers(struct timer_base *base) /* * timer_base::must_forward_clk must be cleared before running * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. + * not try to forward the base. */ base->must_forward_clk = false; @@ -1791,6 +1772,7 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } + base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } -- cgit v1.2.3 From d4f7dae87096dfe722bf32aa82076ece1063746c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:49 +0200 Subject: timers: Spare timer softirq until next expiry Now that the core timer infrastructure doesn't depend anymore on periodic base->clk increments, even when the CPU is not in NO_HZ mode, timer softirqs can be skipped until there are timers to expire. Some spurious softirqs can still remain since base->next_expiry doesn't keep track of canceled timers but this still reduces the number of softirqs significantly: ~15 times less for HZ=1000 and ~5 times less for HZ=100. 
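The run_local_timers() hunk below implements the "raise the softirq only if required" check against base->next_expiry; as a standalone illustration, the decision can be sketched like this (names are stand-ins, jiffies wrap-around handling omitted):

#include <stdbool.h>
#include <stdio.h>

struct tbase { unsigned long next_expiry; };

static bool need_timer_softirq(const struct tbase *std, const struct tbase *dfr,
			       unsigned long now, bool nohz)
{
	if (now >= std->next_expiry)	/* standard base has an expired timer */
		return true;
	if (!nohz)			/* no deferrable base without NO_HZ */
		return false;
	return now >= dfr->next_expiry;	/* otherwise check the deferrable base */
}

int main(void)
{
	struct tbase std = { .next_expiry = 1200 }, dfr = { .next_expiry = 1100 };

	/* jiffies=1000: neither base has expired, the softirq is skipped. */
	printf("%d\n", need_timer_softirq(&std, &dfr, 1000, true));
	/* jiffies=1150: the deferrable base has expired, so it is raised. */
	printf("%d\n", need_timer_softirq(&std, &dfr, 1150, true));
	return 0;
}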
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-11-frederic@kernel.org --- kernel/time/timer.c | 49 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1be92b53b75f..4f78a7bff9e1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1684,40 +1684,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(base->next_expiry, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = base->next_expiry; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1750,7 +1716,7 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); @@ -1763,7 +1729,8 @@ static inline void __run_timers(struct timer_base *base) */ base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk)) { + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); base->clk++; @@ -1798,12 +1765,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); -- cgit v1.2.3 From 0975fb565b8b8f9e0c96d0de39fcb954833ea5e0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:50 +0200 Subject: timers: Remove must_forward_clk There is no reason to keep this guard around. The code makes sure that base->clk remains sane and won't be forwarded beyond jiffies nor set backward. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-12-frederic@kernel.org --- kernel/time/timer.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4f78a7bff9e1..8b3fb52d8c47 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -205,7 +205,6 @@ struct timer_base { unsigned long next_expiry; unsigned int cpu; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -888,12 +887,13 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); - if (!base->must_forward_clk) - return; - - jnow = READ_ONCE(jiffies); + /* + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. + */ if ((long)(jnow - base->clk) < 2) return; @@ -1722,16 +1722,8 @@ static inline void __run_timers(struct timer_base *base) timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. - */ - base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { - levels = collect_expired_timers(base, heads); base->clk++; base->next_expiry = __next_timer_interrupt(base); @@ -1739,7 +1731,6 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } @@ -1935,7 +1926,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } -- cgit v1.2.3 From 36cd28a4cdd05d47ccb62a2d86e8f37839cc879a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:51 +0200 Subject: timers: Lower base clock forwarding threshold There is nothing that prevents from forwarding the base clock if it's one jiffy off. The reason for this arbitrary limit of two jiffies is historical and does not longer exist. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-13-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8b3fb52d8c47..77e21e98ec32 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base) * Also while executing timers, base->clk is 1 offset ahead * of jiffies to avoid endless requeuing to current jffies. 
*/ - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* -- cgit v1.2.3 From baedb87d1b53532f81b4bd0387f83b05d4f7eb9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2020 18:00:02 +0200 Subject: genirq/affinity: Handle affinity setting on inactive interrupts correctly Setting interrupt affinity on inactive interrupts is inconsistent when hierarchical irq domains are enabled. The core code should just store the affinity and not call into the irq chip driver for inactive interrupts because the chip drivers may not be in a state to handle such requests. X86 has a hacky workaround for that but all other irq chips have not which causes problems e.g. on GIC V3 ITS. Instead of adding more ugly hacks all over the place, solve the problem in the core code. If the affinity is set on an inactive interrupt then: - Store it in the irq descriptors affinity mask - Update the effective affinity to reflect that so user space has a consistent view - Don't call into the irq chip driver This is the core equivalent of the X86 workaround and works correctly because the affinity setting is established in the irq chip when the interrupt is activated later on. Note, that this is only effective when hierarchical irq domains are enabled by the architecture. Doing it unconditionally would break legacy irq chip implementations. For hierarchial irq domains this works correctly as none of the drivers can have a dependency on affinity setting in inactive state by design. Remove the X86 workaround as it is not longer required. Fixes: 02edee152d6e ("x86/apic/vector: Ignore set_affinity call for inactive interrupts") Reported-by: Ali Saidi Signed-off-by: Thomas Gleixner Tested-by: Ali Saidi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200529015501.15771-1-alisaidi@amazon.com Link: https://lkml.kernel.org/r/877dv2rv25.fsf@nanos.tec.linutronix.de --- kernel/irq/manage.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..2a9fec53e159 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s did not update eff. 
affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,26 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. + */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +344,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { -- cgit v1.2.3 From 9180bd467f9abdb44afde650d07e3b9dd66d837c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:40 -0300 Subject: futex: Remove put_futex_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 4b39f99c ("futex: Remove {get,drop}_futex_key_refs()"), put_futex_key() is empty. Remove all references for this function and the then redundant labels. 
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-2-andrealmeid@collabora.com --- kernel/futex.c | 61 ++++++++++++---------------------------------------------- 1 file changed, 12 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..bd9adfca5d51 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -677,10 +677,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1617,7 +1613,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + goto out; spin_lock(&hb->lock); @@ -1640,8 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); out: return ret; } @@ -1712,7 +1706,7 @@ retry: goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1724,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + goto out; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + goto out; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1738,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,10 +1773,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret; } @@ -1996,7 +1984,7 @@ retry: ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -2004,7 +1992,7 @@ retry: */ if (requeue_pi && match_futex(&key1, &key2)) { ret = -EINVAL; - goto out_put_keys; + goto out; } hb1 = hash_futex(&key1); @@ -2025,13 +2013,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,8 +2076,6 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -2106,8 +2090,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2217,10 +2199,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret ? ret : task_count; } @@ -2697,7 +2675,6 @@ retry_private: if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2707,8 +2684,6 @@ retry_private: } out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2828,6 @@ retry_private: * - EAGAIN: The user space value changed. 
*/ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2961,13 +2935,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2952,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3085,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3242,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ @@ -3284,7 +3252,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3342,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.2.3 From d7c5ed73b19c4640426d9c106f70ec2cb532034d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:41 -0300 Subject: futex: Remove needless goto's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As stated in the coding style documentation, "if there is no cleanup needed then just return directly", instead of jumping to a label and then returning. Remove such goto's and replace with a return statement. When there's a ternary operator on the return value, replace it with the result of the operation when it is logically possible to determine it by the control flow. 
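For illustration, the pattern being applied looks like this in isolation; do_lookup() and do_work() are hypothetical helpers standing in for the real futex calls, and the actual conversions are in the futex.c hunks below.

#include <stdio.h>

static int do_lookup(int key) { return key < 0 ? -22 : 0; }	/* stand-in */
static int do_work(int key) { return key + 1; }			/* stand-in */

/* Before: jump to a label whose only job is to return. */
static int lookup_old(int key)
{
	int ret;

	ret = do_lookup(key);
	if (ret)
		goto out;
	ret = do_work(key);
out:
	return ret;
}

/* After: no cleanup is needed, so return directly. */
static int lookup_new(int key)
{
	int ret;

	ret = do_lookup(key);
	if (ret)
		return ret;
	return do_work(key);
}

int main(void)
{
	/* Both variants behave identically. */
	printf("%d %d\n", lookup_old(3), lookup_new(3));
	printf("%d %d\n", lookup_old(-1), lookup_new(-1));
	return 0;
}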
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-3-andrealmeid@collabora.com --- kernel/futex.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bd9adfca5d51..362fbca6d614 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1607,13 +1607,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out; + return ret; spin_lock(&hb->lock); @@ -1636,7 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out: return ret; } @@ -1703,10 +1702,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1724,13 +1723,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1773,7 +1772,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out: return ret; } @@ -1980,20 +1978,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2013,7 +2009,7 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2079,7 +2075,7 @@ retry_private: ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2198,8 +2194,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out: return ret ? ret : task_count; } @@ -2545,7 +2539,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2558,7 +2552,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2572,8 +2566,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? 
ret : locked; + return ret; } /** @@ -2670,7 +2663,7 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2683,7 +2676,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: return ret; } -- cgit v1.2.3 From 9261308598ad28b9a8a2237d881833e9f217244e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:43 -0300 Subject: futex: Consistently use fshared as boolean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fshared is only conveying true/false values, declare it as bool. In get_futex_key() the usage of fshared can be restricted to the first part of the function. If fshared is false the function is terminated early and the subsequent code can use a constant 'true' instead of the variable. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-5-andrealmeid@collabora.com --- kernel/futex.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 362fbca6d614..cda91755b77d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -476,7 +476,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +500,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +538,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +626,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } -- cgit v1.2.3 From 9a71df495c3d29dab596bb590e73fd8b20106e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:42 -0300 Subject: futex: Remove unused or redundant includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 82af7aca ("Removal of FUTEX_FD"), some includes related to file operations aren't needed anymore. More investigation around the includes showed that a lot of includes aren't required for compilation, possible due to redundant includes. Simplify the code by removing unused includes. 
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-4-andrealmeid@collabora.com --- kernel/futex.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cda91755b77d..4616d4ad609d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" */ #include -#include -#include -#include -#include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include -#include #include -- cgit v1.2.3 From ce3aa9cc5109363099b7c4ac82e2c9768afcaf31 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:22 +0200 Subject: bpf, netns: Handle multiple link attachments Extend the BPF netns link callbacks to rebuild (grow/shrink) or update the prog_array at given position when link gets attached/updated/released. This let's us lift the limit of having just one link attached for the new attach type introduced by subsequent patch. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200717103536.397595-2-jakub@cloudflare.com --- kernel/bpf/core.c | 55 ++++++++++++++++++++++++++++ kernel/bpf/net_namespace.c | 90 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 136 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9df4cc9a2907..7be02e555ab9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, } } +/** + * bpf_prog_array_delete_safe_at() - Replaces the program at the given + * index into the program array with + * a dummy no-op program. + * @array: a bpf_prog_array + * @index: the index of the program to replace + * + * Skips over dummy programs, by not counting them, when calculating + * the the position of the program to replace. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) +{ + return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); +} + +/** + * bpf_prog_array_update_at() - Updates the program at the given index + * into the program array. + * @array: a bpf_prog_array + * @index: the index of the program to update + * @prog: the program to insert into the array + * + * Skips over dummy programs, by not counting them, when calculating + * the position of the program to update. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. 
+ * * -ENOENT - Index out of range + */ +int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, + struct bpf_prog *prog) +{ + struct bpf_prog_array_item *item; + + if (unlikely(index < 0)) + return -EINVAL; + + for (item = array->items; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) + continue; + if (!index) { + WRITE_ONCE(item->prog, prog); + return 0; + } + index--; + } + return -ENOENT; +} + int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 310241ca7991..e9c8e26ac8f2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -36,12 +36,50 @@ static void netns_bpf_run_array_detach(struct net *net, bpf_prog_array_free(run_array); } +static int link_index(struct net *net, enum netns_bpf_attach_type type, + struct bpf_netns_link *link) +{ + struct bpf_netns_link *pos; + int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + if (pos == link) + return i; + i++; + } + return -ENOENT; +} + +static int link_count(struct net *net, enum netns_bpf_attach_type type) +{ + struct list_head *pos; + int i = 0; + + list_for_each(pos, &net->bpf.links[type]) + i++; + return i; +} + +static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, + struct bpf_prog_array *prog_array) +{ + struct bpf_netns_link *pos; + unsigned int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + prog_array->items[i].prog = pos->link.prog; + i++; + } +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *old_array, *new_array; struct net *net; + int cnt, idx; mutex_lock(&netns_bpf_mutex); @@ -53,9 +91,27 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; - netns_bpf_run_array_detach(net, type); + /* Remember link position in case of safe delete */ + idx = link_index(net, type, net_link); list_del(&net_link->node); + cnt = link_count(net, type); + if (!cnt) { + netns_bpf_run_array_detach(net, type); + goto out_unlock; + } + + old_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!new_array) { + WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); + goto out_unlock; + } + fill_prog_array(net, type, new_array); + rcu_assign_pointer(net->bpf.run_array[type], new_array); + bpf_prog_array_free(old_array); + out_unlock: mutex_unlock(&netns_bpf_mutex); } @@ -77,7 +133,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; - int ret = 0; + int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; @@ -95,7 +151,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); - WRITE_ONCE(run_array->items[0].prog, new_prog); + idx = link_index(net, type, net_link); + ret = bpf_prog_array_update_at(run_array, idx, new_prog); + if (ret) + goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); @@ -309,18 +368,28 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static int 
netns_bpf_max_progs(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + return 1; + default: + return 0; + } +} + static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; - int err; + int cnt, err; mutex_lock(&netns_bpf_mutex); - /* Allow attaching only one prog or link for now */ - if (!list_empty(&net->bpf.links[type])) { + cnt = link_count(net, type); + if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } @@ -341,16 +410,19 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } - run_array->items[0].prog = link->prog; - rcu_assign_pointer(net->bpf.run_array[type], run_array); list_add_tail(&net_link->node, &net->bpf.links[type]); + fill_prog_array(net, type, run_array); + run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); + out_unlock: mutex_unlock(&netns_bpf_mutex); return err; -- cgit v1.2.3 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. 
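For illustration, a minimal SK_LOOKUP program might look like the sketch below (not part of this patch; the map name redir_map and the function name steer_port80 are hypothetical). It steers every new TCP connection destined to local port 80 to the socket stored in a one-element SOCKMAP, and otherwise lets the regular lookup proceed:

#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} redir_map SEC(".maps");

SEC("sk_lookup")
int steer_port80(struct bpf_sk_lookup *ctx)
{
	const __u32 zero = 0;
	struct bpf_sock *sk;
	long err;

	/* Only take over TCP lookups for local port 80. */
	if (ctx->protocol != IPPROTO_TCP || ctx->local_port != 80)
		return SK_PASS;

	sk = bpf_map_lookup_elem(&redir_map, &zero);
	if (!sk)
		return SK_PASS;	/* nothing configured, fall back to htable lookup */

	err = bpf_sk_assign(ctx, sk, 0);
	bpf_sk_release(sk);
	return err ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";

Such a program is attached with a BPF link (BPF_LINK_CREATE, attach type BPF_SK_LOOKUP, with a network namespace fd as target), which is the path the netns link plumbing in this patch is extended to accept.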
Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 5 +++++ kernel/bpf/syscall.c | 9 +++++++++ kernel/bpf/verifier.c | 13 ++++++++++--- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything 
as its return value * depends on the to-be-replaced kernel func or bpf program. -- cgit v1.2.3 From 1559b4aa1db443096af493c7d621dc156054babe Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:25 +0200 Subject: inet: Run SK_LOOKUP BPF program on socket lookup Run a BPF program before looking up a listening socket on the receive path. Program selects a listening socket to yield as result of socket lookup by calling bpf_sk_assign() helper and returning SK_PASS code. Program can revert its decision by assigning a NULL socket with bpf_sk_assign(). Alternatively, BPF program can also fail the lookup by returning with SK_DROP, or let the lookup continue as usual with SK_PASS on return, when no socket has been selected with bpf_sk_assign(). This lets the user match packets with listening sockets freely at the last possible point on the receive path, where we know that packets are destined for local delivery after undergoing policing, filtering, and routing. With BPF code selecting the socket, directing packets destined to an IP range or to a port range to a single socket becomes possible. In case multiple programs are attached, they are run in series in the order in which they were attached. The end result is determined from return codes of all the programs according to following rules: 1. If any program returned SK_PASS and selected a valid socket, the socket is used as result of socket lookup. 2. If more than one program returned SK_PASS and selected a socket, last selection takes effect. 3. If any program returned SK_DROP, and no program returned SK_PASS and selected a socket, socket lookup fails with -ECONNREFUSED. 4. If all programs returned SK_PASS and none of them selected a socket, socket lookup continues to htable-based lookup. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 38b368bccda2..4e1bcaa2c3cb 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -25,6 +25,28 @@ struct bpf_netns_link { /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_SK_LOOKUP: + static_branch_dec(&bpf_sk_lookup_enabled); + break; + default: + break; + } +} + +static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_SK_LOOKUP: + static_branch_inc(&bpf_sk_lookup_enabled); + break; + default: + break; + } +} + /* Must be called with netns_bpf_mutex held. 
*/ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -91,6 +113,9 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; + /* Mark attach point as unused */ + netns_bpf_attach_type_unneed(type); + /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); @@ -428,6 +453,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); + /* Mark attach point as used */ + netns_bpf_attach_type_need(type); + out_unlock: mutex_unlock(&netns_bpf_mutex); return err; @@ -503,8 +531,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - list_for_each_entry(net_link, &net->bpf.links[type], node) + list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ + netns_bpf_attach_type_unneed(type); + } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } -- cgit v1.2.3 From 2f9237d4f6df49b74c51cdac555b0a9979d0c237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:30:00 +0200 Subject: dma-mapping: make support for dma ops optional Avoid the overhead of the dma ops support for tiny builds that only use the direct mapping. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- kernel/dma/Kconfig | 4 ++++ kernel/dma/Makefile | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1da3f44f2565..5cfb2428593a 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -5,6 +5,9 @@ config HAS_DMA depends on !NO_DMA default y +config DMA_OPS + bool + config NEED_SG_DMA_LENGTH bool @@ -60,6 +63,7 @@ config DMA_NONCOHERENT_CACHE_SYNC config DMA_VIRT_OPS bool depends on HAS_DMA + select DMA_OPS config SWIOTLB bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 370f63344e9c..32c7c1942bbd 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o -- cgit v1.2.3 From d35834c64820c7ef397f8a244061d4450720540e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2020 18:19:30 +0100 Subject: dma-mapping: add a dma_ops_bypass flag to struct device Several IOMMU drivers have a bypass mode where they can use a direct mapping if the devices DMA mask is large enough. Add generic support to the core dma-mapping code to do that to switch those drivers to a common solution. 
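As a rough sketch of the driver side (not taken from this series; my_iommu_dma_supported() is a made-up name, and the real bypass condition may also need to account for bus_dma_limit), an IOMMU driver that selects DMA_OPS_BYPASS could set the new flag from its ->dma_supported() callback once the device's DMA mask covers everything the direct mapping can reach:

#include <linux/device.h>
#include <linux/dma-direct.h>
#include <linux/dma-mapping.h>

static int my_iommu_dma_supported(struct device *dev, u64 mask)
{
	/*
	 * A wide enough mask lets dma_map_*() take the dma-direct fast
	 * path; narrower masks keep going through the IOMMU ops.
	 */
	dev->dma_ops_bypass = mask >= dma_direct_get_required_mask(dev);

	return 1;
}

The core then consults the flag on every streaming mapping call in dma_go_direct(), added by this patch.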
Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- kernel/dma/Kconfig | 8 ++++++ kernel/dma/mapping.c | 74 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 5cfb2428593a..f4770fcfa62b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -8,6 +8,14 @@ config HAS_DMA config DMA_OPS bool +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b53953024512..0d129421e75f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,9 +105,35 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -static inline bool dma_is_direct(const struct dma_map_ops *ops) +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) { - return likely(!ops); + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. + */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, @@ -118,7 +144,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -134,7 +160,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -153,7 +179,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -172,7 +198,7 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); @@ -191,7 +217,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = 
dma_direct_map_resource(dev, phys_addr, size, dir, attrs); else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); @@ -207,7 +233,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) + if (!dma_map_direct(dev, ops) && ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } @@ -219,7 +245,7 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); @@ -233,7 +259,7 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_device(dev, addr, size, dir); else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); @@ -247,7 +273,7 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); @@ -261,7 +287,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_device(dev, sg, nelems, dir); else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); @@ -302,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) @@ -372,7 +398,7 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); return ops->mmap != NULL; } @@ -397,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) @@ -410,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -441,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -473,7 +499,7 @@ 
void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); @@ -484,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -540,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -552,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -565,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) return dma_direct_need_sync(dev, dma_addr); return ops->sync_single_for_cpu || ops->sync_single_for_device; } -- cgit v1.2.3 From 23efed6fa7519e29a01f12723b0cfa8118dc87cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 17:00:16 +0200 Subject: dma-debug: use named initializers for dir2name Make dir2name a little more readable and maintainable by using named initializers. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/debug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..4dd0d9028ec8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -144,8 +144,12 @@ static const char *type2name[] = { [dma_debug_resource] = "resource", }; -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; +static const char *dir2name[] = { + [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL", + [DMA_TO_DEVICE] = "DMA_TO_DEVICE", + [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE", + [DMA_NONE] = "DMA_NONE", +}; /* * The access to some variables in this macro is racy. We can't use atomic_t -- cgit v1.2.3 From 1caef81da05a84a40dbf02110e967ce6d1135ff6 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:12 +0200 Subject: pid: use checkpoint_restore_ns_capable() for set_tid Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow using clone3() with set_tid set. 
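For reference, the userspace side that this check gates might look like the sketch below (illustrative only; clone3_with_tid() is a made-up helper). The call fails with EPERM unless the caller has CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN over the pid namespace's owning user namespace:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args */
#include <sys/syscall.h>
#include <sys/types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static pid_t clone3_with_tid(pid_t want_tid)
{
	pid_t set_tid[1] = { want_tid };	/* one entry per pid namespace level */
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.set_tid = (uintptr_t)set_tid;
	args.set_tid_size = 1;
	args.exit_signal = SIGCHLD;

	return syscall(__NR_clone3, &args, sizeof(args));
}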
Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-3-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..450d40469b1c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; - if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } -- cgit v1.2.3 From b9a3db92e1a1f281724e2f34b849d18d1a53bece Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:13 +0200 Subject: pid_namespace: use checkpoint_restore_ns_capable() for ns_last_pid Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow writing to ns_last_pid. Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-4-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0e5ac162c3a8..ac135bd600eb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, struct ctl_table tmp = *table; int ret, next; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* -- cgit v1.2.3 From ebd6de6812387a2db9a52842cfbe004da1dd3be8 Mon Sep 17 00:00:00 2001 From: Nicolas Viennot Date: Sun, 19 Jul 2020 12:04:15 +0200 Subject: prctl: Allow local CAP_CHECKPOINT_RESTORE to change /proc/self/exe Originally, only a local CAP_SYS_ADMIN could change the exe link, making it difficult for doing checkpoint/restore without CAP_SYS_ADMIN. This commit adds CAP_CHECKPOINT_RESTORE in addition to CAP_SYS_ADMIN for permitting changing the exe link. The following describes the history of the /proc/self/exe permission checks as it may be difficult to understand what decisions lead to this point. * [1] May 2012: This commit introduces the ability of changing /proc/self/exe if the user is CAP_SYS_RESOURCE capable. In the related discussion [2], no clear thread model is presented for what could happen if the /proc/self/exe changes multiple times, or why would the admin be at the mercy of userspace. * [3] Oct 2014: This commit introduces a new API to change /proc/self/exe. The permission no longer checks for CAP_SYS_RESOURCE, but instead checks if the current user is root (uid=0) in its local namespace. In the related discussion [4] it is said that "Controlling exe_fd without privileges may turn out to be dangerous. At least things like tomoyo examine it for making policy decisions (see tomoyo_manager())." * [5] Dec 2016: This commit removes the restriction to change /proc/self/exe at most once. The related discussion [6] informs that the audit subsystem relies on the exe symlink, presumably audit_log_d_path_exe() in kernel/audit.c. * [7] May 2017: This commit changed the check from uid==0 to local CAP_SYS_ADMIN. No discussion. 
* [8] July 2020: A PoC to spoof any program's /proc/self/exe via ptrace is demonstrated Overall, the concrete points that were made to retain capability checks around changing the exe symlink is that tomoyo_manager() and audit_log_d_path_exe() uses the exe_file path. Christian Brauner said that relying on /proc//exe being immutable (or guarded by caps) in a sake of security is a bit misleading. It can only be used as a hint without any guarantees of what code is being executed once execve() returns to userspace. Christian suggested that in the future, we could call audit_log() or similar to inform the admin of all exe link changes, instead of attempting to provide security guarantees via permission checks. However, this proposed change requires the understanding of the security implications in the tomoyo/audit subsystems. [1] b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file") [2] https://lore.kernel.org/patchwork/patch/292515/ [3] f606b77f1a9e ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") [4] https://lore.kernel.org/patchwork/patch/479359/ [5] 3fb4afd9a504 ("prctl: remove one-shot limitation for changing exe link") [6] https://lore.kernel.org/patchwork/patch/697304/ [7] 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file") [8] https://github.com/nviennot/run_as_exe Signed-off-by: Nicolas Viennot Signed-off-by: Adrian Reber Link: https://lore.kernel.org/r/20200719100418.2112740-6-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/sys.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 00a96746e28a..a3f4ef0bbda3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2007,11 +2007,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.exe_fd != (u32)-1) { /* - * Make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. + * Check if the current user is checkpoint/restore capable. + * At the time of this writing, it checks for CAP_SYS_ADMIN + * or CAP_CHECKPOINT_RESTORE. + * Note that a user with access to ptrace can masquerade an + * arbitrary program as any executable, even setuid ones. + * This may have implications in the tomoyo subsystem. */ - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(current_user_ns())) return -EINVAL; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); -- cgit v1.2.3 From 227175b2c914bfa4a9aa5b210c7cabd42c5858ac Mon Sep 17 00:00:00 2001 From: Nicolas Viennot Date: Sun, 19 Jul 2020 12:04:16 +0200 Subject: prctl: exe link permission error changed from -EINVAL to -EPERM This brings consistency with the rest of the prctl() syscall where -EPERM is returned when failing a capability check. Signed-off-by: Nicolas Viennot Signed-off-by: Adrian Reber Reviewed-by: Serge Hallyn Link: https://lore.kernel.org/r/20200719100418.2112740-7-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a3f4ef0bbda3..ca11af9d815d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2015,7 +2015,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * This may have implications in the tomoyo subsystem. 
*/ if (!checkpoint_restore_ns_capable(current_user_ns())) - return -EINVAL; + return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) -- cgit v1.2.3 From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:24 +0800 Subject: sched_clock: Expose struct clock_read_data In order to support perf_event_mmap_page::cap_time features, an architecture needs, aside from a userspace readable counter register, to expose the exact clock data so that userspace can convert the counter register into a correct timestamp. Provide struct clock_read_data and two (seqcount) helpers so that architectures (arm64 in specific) can expose the numbers to userspace. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org Signed-off-by: Will Deacon --- kernel/time/sched_clock.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0acaadc3156c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -19,31 +19,6 @@ #include "timekeeping.h" -/** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. - */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - /** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } -- cgit v1.2.3 From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 16 Jul 2020 13:11:25 +0800 Subject: time/sched_clock: Use raw_read_seqcount_latch() sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. 
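In generic form, the latch read side relied on here is roughly the sketch below (illustrative only; my_seq, my_copies and my_read() are made-up names). Two copies of the data exist, and the low bit of the latch sequence selects the copy that is currently stable:

#include <linux/seqlock.h>
#include <linux/types.h>

struct my_data {
	u64 a;
	u64 b;
};

static seqcount_t my_seq;
static struct my_data my_copies[2];	/* writer updates the two copies alternately */

static struct my_data my_read(void)
{
	struct my_data snap;
	unsigned int seq;

	do {
		seq = raw_read_seqcount_latch(&my_seq);
		snap = my_copies[seq & 1];	/* copy selected via a dependent load */
	} while (read_seqcount_retry(&my_seq, seq));

	return snap;
}

The writer steers readers between the two copies with raw_write_seqcount_latch() around each update, as update_clock_read_data() in this file already does.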
Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it also has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the seqcount_t latch read path. Signed-off-by: Ahmed S. Darwish Signed-off-by: Leo Yan Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Will Deacon --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0acaadc3156c..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount(&cd.seq); + *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -- cgit v1.2.3 From 4834177e633258fbf3c5754b1220f01c705b79eb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 9 Jul 2020 01:19:11 -0500 Subject: ima: Support additional conditionals in the KEXEC_CMDLINE hook function Take the properties of the kexec kernel's inode and the current task ownership into consideration when matching a KEXEC_CMDLINE operation to the rules in the IMA policy. This allows for some uniformity when writing IMA policy rules for KEXEC_KERNEL_CHECK, KEXEC_INITRAMFS_CHECK, and KEXEC_CMDLINE operations. Prior to this patch, it was not possible to write a set of rules like this: dont_measure func=KEXEC_KERNEL_CHECK obj_type=foo_t dont_measure func=KEXEC_INITRAMFS_CHECK obj_type=foo_t dont_measure func=KEXEC_CMDLINE obj_type=foo_t measure func=KEXEC_KERNEL_CHECK measure func=KEXEC_INITRAMFS_CHECK measure func=KEXEC_CMDLINE The inode information associated with the kernel being loaded by a kexec_kernel_load(2) syscall can now be included in the decision to measure or not Additonally, the uid, euid, and subj_* conditionals can also now be used in KEXEC_CMDLINE rules. There was no technical reason as to why those conditionals weren't being considered previously other than ima_match_rules() didn't have a valid inode to use so it immediately bailed out for KEXEC_CMDLINE operations rather than going through the full list of conditional comparisons. Signed-off-by: Tyler Hicks Cc: Eric Biederman Cc: kexec@lists.infradead.org Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..07df431c1f21 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -287,7 +287,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, goto out; } - ima_kexec_cmdline(image->cmdline_buf, + ima_kexec_cmdline(kernel_fd, image->cmdline_buf, image->cmdline_buf_len - 1); } -- cgit v1.2.3 From be619f7f063a49c656f620a46af4f8ea3e759e91 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 13 Jul 2020 12:06:48 -0500 Subject: exec: Implement kernel_execve To allow the kernel not to play games with set_fs to call exec implement kernel_execve. The function kernel_execve takes pointers into kernel memory and copies the values pointed to onto the new userspace stack. The calls with arguments from kernel space of do_execve are replaced with calls to kernel_execve. The calls do_execve and do_execveat are made static as there are now no callers outside of exec. The comments that mention do_execve are updated to refer to kernel_execve or execve depending on the circumstances. In addition to correcting the comments, this makes it easy to grep for do_execve and verify it is not used. Inspired-by: https://lkml.kernel.org/r/20200627072704.2447163-1-hch@lst.de Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/87wo365ikj.fsf@x220.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 6ca2096298b9..a25433f9cd9a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -98,9 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* -- cgit v1.2.3 From f1d9b23cabc61e58509164c3c3132556476491d2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 13 Jul 2020 15:51:59 -0400 Subject: audit: purge audit_log_string from the intra-kernel audit API audit_log_string() was inteded to be an internal audit function and since there are only two internal uses, remove them. Purge all external uses of it by restructuring code to use an existing audit_log_format() or using audit_log_format(). Please see the upstream issue https://github.com/linux-audit/audit-kernel/issues/84 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8c201f414226..a2f3e34aa724 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2080,13 +2080,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { - audit_log_string(ab, ""); + audit_log_format(ab, "\"\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ - audit_log_string(ab, ""); + audit_log_format(ab, "\"\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); -- cgit v1.2.3 From b43870c74f3fdf0cd06bf5f1b7a5ed70a2cd4ed2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Sat, 4 Jul 2020 15:15:28 +0000 Subject: audit: report audit wait metric in audit status reply In environments where the preservation of audit events and predictable usage of system memory are prioritized, admins may use a combination of --backlog_wait_time and -b options at the risk of degraded performance resulting from backlog waiting. In some cases, this risk may be preferred to lost events or unbounded memory usage. 
Ideally, this risk can be mitigated by making adjustments when backlog waiting is detected. However, detection can be difficult using the currently available metrics. For example, an admin attempting to debug degraded performance may falsely believe a full backlog indicates backlog waiting. It may turn out the backlog frequently fills up but drains quickly. To make it easier to reliably track degraded performance to backlog waiting, this patch makes the following changes: Add a new field backlog_wait_time_total to the audit status reply. Initialize this field to zero. Add to this field the total time spent by the current task on scheduled timeouts while the backlog limit is exceeded. Reset field to zero upon request via AUDIT_SET. Tested on Ubuntu 18.04 using complementary changes to the audit-userspace and audit-testsuite: - https://github.com/linux-audit/audit-userspace/pull/134 - https://github.com/linux-audit/audit-testsuite/pull/97 Signed-off-by: Max Englander Signed-off-by: Paul Moore --- kernel/audit.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a2f3e34aa724..d72663ac248c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); 
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3 From 343ead287dde0064c430ee1db477726f52e0269c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 21 Jul 2020 12:07:16 +0200 Subject: bpf, netns: Fix build without CONFIG_INET When CONFIG_NET is set but CONFIG_INET isn't, build fails with: ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_unneed': kernel/bpf/net_namespace.c:32: undefined reference to `bpf_sk_lookup_enabled' ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_need': kernel/bpf/net_namespace.c:43: undefined reference to `bpf_sk_lookup_enabled' This is because without CONFIG_INET bpf_sk_lookup_enabled symbol is not available. Wrap references to bpf_sk_lookup_enabled with preprocessor conditionals. Fixes: 1559b4aa1db4 ("inet: Run SK_LOOKUP BPF program on socket lookup") Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/bpf/20200721100716.720477-1-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 4e1bcaa2c3cb..71405edd667c 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -28,9 +28,11 @@ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { +#ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; +#endif default: break; } @@ -39,9 +41,11 @@ static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { +#ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; +#endif default: break; } -- cgit v1.2.3 From c576b9c77bea9a8ad11ddc277af3edbf15733ddd Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 19 Jul 2020 17:52:41 +0200 Subject: bpf: cpumap: Fix possible rcpu kthread hung Fix the following cpumap kthread hung. The issue is currently occurring when __cpu_map_load_bpf_program fails (e.g if the bpf prog has not BPF_XDP_CPUMAP as expected_attach_type) $./test_progs -n 101 101/1 cpumap_with_progs:OK 101 xdp_cpumap_attach:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED [ 369.996478] INFO: task cpumap/0/map:7:205 blocked for more than 122 seconds. [ 369.998463] Not tainted 5.8.0-rc4-01472-ge57892f50a07 #212 [ 370.000102] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 370.001918] cpumap/0/map:7 D 0 205 2 0x00004000 [ 370.003228] Call Trace: [ 370.003930] __schedule+0x5c7/0xf50 [ 370.004901] ? io_schedule_timeout+0xb0/0xb0 [ 370.005934] ? static_obj+0x31/0x80 [ 370.006788] ? mark_held_locks+0x24/0x90 [ 370.007752] ? cpu_map_bpf_prog_run_xdp+0x6c0/0x6c0 [ 370.008930] schedule+0x6f/0x160 [ 370.009728] schedule_preempt_disabled+0x14/0x20 [ 370.010829] kthread+0x17b/0x240 [ 370.011433] ? 
kthread_create_worker_on_cpu+0xd0/0xd0 [ 370.011944] ret_from_fork+0x1f/0x30 [ 370.012348] Showing all locks held in the system: [ 370.013025] 1 lock held by khungtaskd/33: [ 370.013432] #0: ffffffff82b24720 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x28/0x1c3 [ 370.014461] ============================================= Fixes: 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap") Reported-by: Jakub Sitnicki Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Tested-by: Jakub Sitnicki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/e54f2aabf959f298939e5507b09c48f8c2e380be.1595170625.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 4c95d0615ca2..f1c46529929b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -453,24 +453,27 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) rcpu->map_id = map_id; rcpu->value.qsize = value->qsize; + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map_id); if (IS_ERR(rcpu->kthread)) - goto free_ptr_ring; + goto free_prog; get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) - goto free_ptr_ring; - /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); return rcpu; +free_prog: + if (rcpu->prog) + bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: -- cgit v1.2.3 From bc4f0548f683a3d53359cef15f088d2d5bb4bc39 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:33:58 -0700 Subject: bpf: Compute bpf_skc_to_*() helper socket btf ids at build time Currently, socket types (struct tcp_sock, udp_sock, etc.) used by bpf_skc_to_*() helpers are computed when vmlinux_btf is first built in the kernel. Commit 5a2798ab32ba ("bpf: Add BTF_ID_LIST/BTF_ID/BTF_ID_UNUSED macros") implemented a mechanism to compute btf_ids at kernel build time which can simplify kernel implementation and reduce runtime overhead by removing in-kernel btf_id calculation. This patch did exactly this, removing in-kernel btf_id computation and utilizing build-time btf_id computation. If CONFIG_DEBUG_INFO_BTF is not defined, BTF_ID_LIST will define an array with size of 5, which is not enough for btf_sock_ids. So define its own static array if CONFIG_DEBUG_INFO_BTF is not defined. 
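For context, a use of the build-time mechanism looks roughly like the sketch below (illustrative; example_sock_btf_ids is a made-up list name). The macros emit entries that the resolve_btfids tool patches with the final BTF IDs while linking vmlinux; without CONFIG_DEBUG_INFO_BTF they fall back to a small zero-filled array:

#include <linux/btf_ids.h>
#include <linux/types.h>

BTF_ID_LIST(example_sock_btf_ids)
BTF_ID(struct, tcp_sock)
BTF_ID(struct, udp_sock)

/* At run time the resolved IDs are plain array reads. */
static u32 example_tcp_sock_btf_id(void)
{
	return example_sock_btf_ids[0];
}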
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163358.1393023-1-yhs@fb.com --- kernel/bpf/btf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 03d6d43bb1d6..315cde73421b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3672,7 +3672,6 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); - init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); -- cgit v1.2.3 From 951cf368bcb11d6f817709660cf5cd914072c36f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:34:03 -0700 Subject: bpf: net: Use precomputed btf_id for bpf iterators One additional field btf_id is added to struct bpf_ctx_arg_aux to store the precomputed btf_ids. The btf_id is computed at build time with BTF_ID_LIST or BTF_ID_LIST_GLOBAL macro definitions. All existing bpf iterators are changed to used pre-compute btf_ids. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163403.1393551-1-yhs@fb.com --- kernel/bpf/btf.c | 5 +++-- kernel/bpf/map_iter.c | 7 ++++++- kernel/bpf/task_iter.c | 12 ++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 315cde73421b..ee36b7f60936 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3817,16 +3817,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; - break; + info->btf_id = ctx_arg_info->btf_id; + return true; } } + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c69071e334bf..8a7af11b411f 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -4,6 +4,7 @@ #include #include #include +#include struct bpf_iter_seq_map_info { u32 mid; @@ -81,7 +82,10 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; -static const struct bpf_iter_reg bpf_map_reg_info = { +BTF_ID_LIST(btf_bpf_map_id) +BTF_ID(struct, bpf_map) + +static struct bpf_iter_reg bpf_map_reg_info = { .target = "bpf_map", .seq_ops = &bpf_map_seq_ops, .init_seq_private = NULL, @@ -96,6 +100,7 @@ static const struct bpf_iter_reg bpf_map_reg_info = { static int __init bpf_map_iter_init(void) { + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; return bpf_iter_reg_target(&bpf_map_reg_info); } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4dbf2b6035f8..2feecf095609 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -7,6 +7,7 @@ #include #include #include +#include struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -312,7 +313,11 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; -static const struct bpf_iter_reg task_reg_info = { +BTF_ID_LIST(btf_task_file_ids) +BTF_ID(struct, task_struct) +BTF_ID(struct, file) + +static struct bpf_iter_reg task_reg_info = { .target = "task", .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, @@ -325,7 +330,7 @@ static const struct bpf_iter_reg task_reg_info = { }, 
}; -static const struct bpf_iter_reg task_file_reg_info = { +static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, @@ -344,10 +349,13 @@ static int __init task_iter_init(void) { int ret; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_file_reg_info); } late_initcall(task_iter_init); -- cgit v1.2.3 From d136122f58458479fd8926020ba2937de61d7f65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2020 17:20:21 +0200 Subject: sched: Fix race against ptrace_freeze_trace() There is apparently one site that violates the rule that only current and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced() will change task->state for a remote task. Oleg explains: "TASK_TRACED/TASK_STOPPED was always protected by siglock. In particular, ttwu(__TASK_TRACED) must be always called with siglock held. That is why ptrace_freeze_traced() assumes it can safely do s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)." This breaks the ordering scheme introduced by commit: dbfb089d360b ("sched: Fix loadavg accounting race") Specifically, the reload not matching no longer implies we don't have to block. Simply things by noting that what we need is a LOAD->STORE ordering and this can be provided by a control dependency. So replace: prev_state = prev->state; raw_spin_lock(&rq->lock); smp_mb__after_spinlock(); /* SMP-MB */ if (... && prev_state && prev_state == prev->state) deactivate_task(); with: prev_state = prev->state; if (... && prev_state) /* CTRL-DEP */ deactivate_task(); Since that already implies the 'prev->state' load must be complete before allowing the 'prev->on_rq = 0' store to become visible. Fixes: dbfb089d360b ("sched: Fix loadavg accounting race") Reported-by: Jiri Slaby Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Tested-by: Paul Gortmaker Tested-by: Christian Brauner --- kernel/sched/core.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e15543cb8481..5dece9b34e25 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4119,9 +4119,6 @@ static void __sched notrace __schedule(bool preempt) local_irq_disable(); rcu_note_context_switch(preempt); - /* See deactivate_task() below. */ - prev_state = prev->state; - /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) @@ -4145,11 +4142,16 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; + /* - * We must re-load prev->state in case ttwu_remote() changed it - * before we acquired rq->lock. + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. 
*/ - if (!preempt && prev_state && prev_state == prev->state) { + prev_state = prev->state; + if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { @@ -4163,10 +4165,12 @@ static void __sched notrace __schedule(bool preempt) /* * __schedule() ttwu() - * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...) - * LOCK rq->lock goto out; - * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep(); - * p->on_rq = 0; p->state = TASK_WAKING; + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. * * After this, schedule() must not care about p->state any more. */ -- cgit v1.2.3 From 58877d347b58c9e971112df5eb311c13bb0acb28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Jul 2020 14:52:11 +0200 Subject: sched: Better document ttwu() Dave hit the problem fixed by commit: b6e13e85829f ("sched/core: Fix ttwu() race") and failed to understand much of the code involved. Per his request a few comments to (hopefully) clarify things. Requested-by: Dave Chinner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200702125211.GQ4800@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 188 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 10 +++ 2 files changed, 173 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 08d02ce26b71..12db8fbd9c97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -79,6 +79,100 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). 
This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -1543,8 +1637,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1552,8 +1645,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -2318,12 +2410,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. 
*/ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2464,6 +2575,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2471,10 +2590,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2530,8 +2647,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2571,15 +2688,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2595,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2616,8 +2751,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. 
This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -2652,7 +2787,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -3222,8 +3357,10 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3231,8 +3368,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65b72e0487bf..9f33c77258ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1203,6 +1203,16 @@ struct rq_flags { #endif }; +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(&rq->lock); -- cgit v1.2.3 From 46132e3ac58cb2ee48051ed80bffc0070ad59b2e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 1 Jul 2020 14:34:18 -0400 Subject: sched: nohz: stop passing around unused "ticks" parameter. The "ticks" parameter was added in commit 0f004f5a696a ("sched: Cure more NO_HZ load average woes") since calc_global_nohz() was called and needed the "ticks" argument. But in commit c308b56b5398 ("sched: Fix nohz load accounting -- again!") it became unused as the function calc_global_nohz() dropped using "ticks". Fixes: c308b56b5398 ("sched: Fix nohz load accounting -- again!") Signed-off-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593628458-32290-1-git-send-email-paul.gortmaker@windriver.com --- kernel/sched/loadavg.c | 2 +- kernel/time/timekeeping.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..d2a655643a02 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. 
*/ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..63a632f9896c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64); void do_timer(unsigned long ticks) { jiffies_64 += ticks; - calc_global_load(ticks); + calc_global_load(); } /** -- cgit v1.2.3 From 3edecfef028536cb19a120ec8788bd8a11f93b9e Mon Sep 17 00:00:00 2001 From: Peter Puhov Date: Tue, 14 Jul 2020 08:59:41 -0400 Subject: sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal In slow path, when selecting idlest group, if both groups have type group_has_spare, only idle_cpus count gets compared. As a result, if multiple tasks are created in a tight loop, and go back to sleep immediately (while waiting for all tasks to be created), they may be scheduled on the same core, because CPU is back to idle when the new fork happen. For example: sudo perf record -e sched:sched_wakeup_new -- \ sysbench threads --threads=4 run ... total number of events: 61582 ... sudo perf script sysbench 129378 [006] 74586.633466: sched:sched_wakeup_new: sysbench:129380 [120] success=1 CPU:007 sysbench 129378 [006] 74586.634718: sched:sched_wakeup_new: sysbench:129381 [120] success=1 CPU:007 sysbench 129378 [006] 74586.635957: sched:sched_wakeup_new: sysbench:129382 [120] success=1 CPU:007 sysbench 129378 [006] 74586.637183: sched:sched_wakeup_new: sysbench:129383 [120] success=1 CPU:007 This may have negative impact on performance for workloads with frequent creation of multiple threads. In this patch we are using group_util to select idlest group if both groups have equal number of idle_cpus. Comparing the number of idle cpu is not enough in this case, because the newly forked thread sleeps immediately and before we select the cpu for the next one. This is shown in the trace where the same CPU7 is selected for all wakeup_new events. That's why, looking at utilization when there is the same number of CPU is a good way to see where the previous task was placed. Using nr_running doesn't solve the problem because the newly forked task is not running and the cpu would not have been idle in this case and an idle CPU would have been selected instead. With this patch newly created tasks would be better distributed. With this patch: sudo perf record -e sched:sched_wakeup_new -- \ sysbench threads --threads=4 run ... total number of events: 74401 ... 
sudo perf script sysbench 129455 [006] 75232.853257: sched:sched_wakeup_new: sysbench:129457 [120] success=1 CPU:008 sysbench 129455 [006] 75232.854489: sched:sched_wakeup_new: sysbench:129458 [120] success=1 CPU:009 sysbench 129455 [006] 75232.855732: sched:sched_wakeup_new: sysbench:129459 [120] success=1 CPU:010 sysbench 129455 [006] 75232.856980: sched:sched_wakeup_new: sysbench:129460 [120] success=1 CPU:011 We tested this patch with following benchmarks: master: 'commit b3a9e3b9622a ("Linux 5.8-rc1")' 100 iterations of: perf bench -f simple futex wake -s -t 128 -w 1 Lower result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 0.33 | 0.313 | +5.152 | | std (%) | 10.433 | 7.563 | | 100 iterations of: sysbench threads --threads=8 run Higher result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 5235.02 | 5863.73 | +12.01 | | std (%) | 8.166 | 10.265 | | 100 iterations of: sysbench mutex --mutex-num=1 --threads=8 run Lower result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 0.413 | 0.404 | +2.179 | | std (%) | 3.791 | 1.816 | | Signed-off-by: Peter Puhov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200714125941.4174-1-peter.puhov@linaro.org --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98a53a2fe354..2ba8f230feb9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8711,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + break; } -- cgit v1.2.3 From 589343569d7b58a64ec2446e6686c8e79aea7fcf Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 16 Jul 2020 15:04:57 +0800 Subject: smp: Fix a potential usage of stale nr_cpus The get_option() maybe return 0, it means that the nr_cpus is not initialized. Then we will use the stale nr_cpus to initialize the nr_cpu_ids. So fix it. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716070457.53255-1-songmuchun@bytedance.com --- kernel/smp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index aa17eedff5be..d0ae8eb6bf8b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -634,8 +634,7 @@ static int __init nrcpus(char *str) { int nr_cpus; - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) nr_cpu_ids = nr_cpus; return 0; -- cgit v1.2.3 From 25980c7a79af42f2daa73e2f475ebf4cbac8253e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sun, 12 Jul 2020 17:59:15 +0100 Subject: arch_topology, sched/core: Cleanup thermal pressure definition The following commit: 14533a16c46d ("thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code") moved the definition of arch_set_thermal_pressure() to sched/core.c, but kept its declaration in linux/arch_topology.h. When building e.g. 
an x86 kernel with CONFIG_SCHED_THERMAL_PRESSURE=y, cpufreq_cooling.c ends up getting the declaration of arch_set_thermal_pressure() from include/linux/arch_topology.h, which is somewhat awkward. On top of this, sched/core.c unconditionally defines o The thermal_pressure percpu variable o arch_set_thermal_pressure() while arch_scale_thermal_pressure() does nothing unless redefined by the architecture. arch_*() functions are meant to be defined by architectures, so revert the aforementioned commit and re-implement it in a way that keeps arch_set_thermal_pressure() architecture-definable, and doesn't define the thermal pressure percpu variable for kernels that don't need it (CONFIG_SCHED_THERMAL_PRESSURE=n). Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200712165917.9168-2-valentin.schneider@arm.com --- kernel/sched/core.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12db8fbd9c97..bd8e5211d31f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3869,17 +3869,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); - -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. -- cgit v1.2.3 From bd25b4886ddcebec92591f298ce2ce345d7f2ea9 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:51 -0400 Subject: padata: remove start function padata_start() is only used right after pcrypt allocates an instance with all possible CPUs, when PADATA_INVALID can't happen, so there's no need for a separate "start" step. It can be done during allocation to save text, make using padata easier, and avoid unneeded calls in the future. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 4373f7adaa40..931762316612 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -789,30 +789,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - * - * Return: 0 on success or negative error code - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err = -EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - /** * padata_stop - stop the parallel processing * @@ -1100,7 +1076,7 @@ static struct padata_instance *padata_alloc(const char *name, if (padata_setup_cpumasks(pinst)) goto err_free_rcpumask_cbcpu; - pinst->flags = 0; + __padata_start(pinst); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); -- cgit v1.2.3 From 350ef051d4edd884e8dea0be9f3685b4b32142fb Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:52 -0400 Subject: padata: remove stop function padata_stop() has two callers and is unnecessary in both cases. 
When pcrypt calls it before padata_free(), it's being unloaded so there are no outstanding padata jobs[0]. When __padata_free() calls it, it's either along the same path or else pcrypt initialization failed, which of course means there are also no outstanding jobs. Removing it simplifies padata and saves text. [0] https://lore.kernel.org/linux-crypto/20191119225017.mjrak2fwa5vccazl@gondor.apana.org.au/ Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 931762316612..8f55e717ba50 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -789,19 +789,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - #ifdef CONFIG_HOTPLUG_CPU static int __padata_add_cpu(struct padata_instance *pinst, int cpu) @@ -883,7 +870,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); - padata_stop(pinst); free_cpumask_var(pinst->rcpumask.cbcpu); free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); -- cgit v1.2.3 From cec00e6e120f4f0743d2bf0638418684c74b16e0 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:53 -0400 Subject: padata: inline single call of pd_setup_cpumasks() pd_setup_cpumasks() has only one caller. Move its contents inline to prepare for the next cleanup. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 8f55e717ba50..27f90a3c4dc6 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -441,28 +441,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static int pd_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - int err = -ENOMEM; - - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) - goto free_pcpu_mask; - - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); - - return 0; - -free_pcpu_mask: - free_cpumask_var(pd->cpumask.pcpu); -out: - return err; -} - static void __init padata_mt_helper(struct work_struct *w) { struct padata_work *pw = container_of(w, struct padata_work, pw_work); @@ -613,8 +591,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) goto err_free_pqueue; pd->ps = ps; - if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) + + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto err_free_squeue; + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto err_free_pcpu; + + cpumask_copy(pd->cpumask.pcpu, pcpumask); + cpumask_copy(pd->cpumask.cbcpu, cbcpumask); padata_init_pqueues(pd); padata_init_squeues(pd); @@ -626,6 +610,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) return pd; +err_free_pcpu: + free_cpumask_var(pd->cpumask.pcpu); 
err_free_squeue: free_percpu(pd->squeue); err_free_pqueue: -- cgit v1.2.3 From d69e037bcc4a7e31fdd40ae416aa1bd768dd7d99 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:54 -0400 Subject: padata: remove effective cpumasks from the instance A padata instance has effective cpumasks that store the user-supplied masks ANDed with the online mask, but this middleman is unnecessary. parallel_data keeps the same information around. Removing this saves text and code churn in future changes. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 27f90a3c4dc6..4f0a57e5738c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -571,13 +571,8 @@ static void padata_init_pqueues(struct parallel_data *pd) static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { struct padata_instance *pinst = ps->pinst; - const struct cpumask *cbcpumask; - const struct cpumask *pcpumask; struct parallel_data *pd; - cbcpumask = pinst->rcpumask.cbcpu; - pcpumask = pinst->rcpumask.pcpu; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; @@ -597,8 +592,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) goto err_free_pcpu; - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); + cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); + cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); padata_init_pqueues(pd); padata_init_squeues(pd); @@ -668,12 +663,6 @@ static int padata_replace(struct padata_instance *pinst) pinst->flags |= PADATA_RESET; - cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, - cpu_online_mask); - - cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, - cpu_online_mask); - list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); if (err) @@ -856,8 +845,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); - free_cpumask_var(pinst->rcpumask.cbcpu); - free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -1033,20 +1020,13 @@ static struct padata_instance *padata_alloc(const char *name, !padata_validate_cpumask(pinst, cbcpumask)) goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) - goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) - goto err_free_rcpumask_pcpu; - INIT_LIST_HEAD(&pinst->pslist); cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); - cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_rcpumask_cbcpu; + goto err_free_masks; __padata_start(pinst); @@ -1064,10 +1044,6 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; -err_free_rcpumask_cbcpu: - free_cpumask_var(pinst->rcpumask.cbcpu); -err_free_rcpumask_pcpu: - free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); -- cgit v1.2.3 From 
3f257191d31d5eaf154ebdb696efc238837ddd51 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:55 -0400 Subject: padata: fold padata_alloc_possible() into padata_alloc() There's no reason to have two interfaces when there's only one caller. Removing _possible saves text and simplifies future changes. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 4f0a57e5738c..1c0b97891edb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -979,18 +979,12 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc - allocate and initialize a padata instance and specify - * cpumasks for serial and parallel workers. - * + * padata_alloc - allocate and initialize a padata instance * @name: used to identify the instance - * @pcpumask: cpumask that will be used for padata parallelization - * @cbcpumask: cpumask that will be used for padata serialization * * Return: new instance on success, NULL on error */ -static struct padata_instance *padata_alloc(const char *name, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +struct padata_instance *padata_alloc(const char *name) { struct padata_instance *pinst; @@ -1016,14 +1010,11 @@ static struct padata_instance *padata_alloc(const char *name, free_cpumask_var(pinst->cpumask.pcpu); goto err_free_serial_wq; } - if (!padata_validate_cpumask(pinst, pcpumask) || - !padata_validate_cpumask(pinst, cbcpumask)) - goto err_free_masks; INIT_LIST_HEAD(&pinst->pslist); - cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask); + cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask); if (padata_setup_cpumasks(pinst)) goto err_free_masks; @@ -1057,21 +1048,7 @@ err_free_inst: err: return NULL; } - -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @name: used to identify the instance - * - * Return: new instance on success, NULL on error - */ -struct padata_instance *padata_alloc_possible(const char *name) -{ - return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); +EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3 From f601c725a6ac072608f47083e7c8115a3f504f95 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:56 -0400 Subject: padata: remove padata_parallel_queue Only its reorder field is actually used now, so remove the struct and embed @reorder directly in parallel_data. No functional change, just a cleanup. 
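The corresponding include/linux/padata.h change is outside the kernel/ diffstat shown below. As a rough illustrative sketch only (member names besides the reorder list are approximations and other fields of parallel_data are elided), the data-structure change amounts to:

/* Before: a per-CPU wrapper whose only remaining user was the reorder list. */
struct padata_parallel_queue {
        struct padata_list reorder;
        atomic_t num_obj;
        /* ... */
};

struct parallel_data {
        /* ... */
        struct padata_parallel_queue __percpu *pqueue;
};

/* After: the per-CPU reorder list is embedded directly. */
struct parallel_data {
        /* ... */
        struct padata_list __percpu *reorder_list;
};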
Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 1c0b97891edb..16cb894dc272 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -250,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) { - struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; int cpu = pd->cpu; - next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; + reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); if (list_empty(&reorder->list)) { @@ -291,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd) int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_parallel_queue *next_queue; + struct padata_list *reorder; /* * We need to ensure that only one cpu can work on dequeueing of @@ -339,9 +337,8 @@ static void padata_reorder(struct parallel_data *pd) */ smp_mb(); - next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list) && - padata_find_next(pd, false)) + reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); + if (!list_empty(&reorder->list) && padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -401,17 +398,16 @@ void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); - struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - hashed_cpu); + struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; - spin_lock(&pqueue->reorder.lock); + spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. 
*/ - list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + list_for_each_entry_reverse(cur, &reorder->list, list) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - spin_unlock(&pqueue->reorder.lock); + spin_unlock(&reorder->lock); /* * Ensure the addition to the reorder list is ordered correctly @@ -553,17 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd) } } -/* Initialize all percpu queues used by parallel workers */ -static void padata_init_pqueues(struct parallel_data *pd) +/* Initialize per-CPU reorder lists */ +static void padata_init_reorder_list(struct parallel_data *pd) { int cpu; - struct padata_parallel_queue *pqueue; + struct padata_list *list; for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - - __padata_list_init(&pqueue->reorder); - atomic_set(&pqueue->num_obj, 0); + list = per_cpu_ptr(pd->reorder_list, cpu); + __padata_list_init(list); } } @@ -577,13 +571,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) if (!pd) goto err; - pd->pqueue = alloc_percpu(struct padata_parallel_queue); - if (!pd->pqueue) + pd->reorder_list = alloc_percpu(struct padata_list); + if (!pd->reorder_list) goto err_free_pd; pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) - goto err_free_pqueue; + goto err_free_reorder_list; pd->ps = ps; @@ -595,7 +589,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - padata_init_pqueues(pd); + padata_init_reorder_list(pd); padata_init_squeues(pd); pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); @@ -609,8 +603,8 @@ err_free_pcpu: free_cpumask_var(pd->cpumask.pcpu); err_free_squeue: free_percpu(pd->squeue); -err_free_pqueue: - free_percpu(pd->pqueue); +err_free_reorder_list: + free_percpu(pd->reorder_list); err_free_pd: kfree(pd); err: @@ -621,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd) { free_cpumask_var(pd->cpumask.pcpu); free_cpumask_var(pd->cpumask.cbcpu); - free_percpu(pd->pqueue); + free_percpu(pd->reorder_list); free_percpu(pd->squeue); kfree(pd); } -- cgit v1.2.3 From 072e133d554bb74929f902582c8cf66d9fd12771 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 16 Jul 2020 09:15:10 +0200 Subject: tracefs: Remove unnecessary debug_fs checks. This is a preparation for debugfs restricted mode. We don't need debugfs to trace, the removed check stop tracefs to work if debugfs is not initialised. We instead tries to automount within debugfs and relay on it's handling. The code path is to create a backward compatibility from when tracefs was part of debugfs, it is now standalone and does not need debugfs. When debugfs is in restricted it is compiled in but not active and return EPERM to clients and tracefs wont work if it assumes it is active it is compiled in kernel. 
Reported-by: kernel test robot Signed-off-by: Peter Enderborg Reviewed-by: Greg Kroah-Hartman Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200716071511.26864-2-peter.enderborg@sony.com Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..848f67a5f16d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8945,9 +8945,7 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!tracefs_initialized()) || - (IS_ENABLED(CONFIG_DEBUG_FS) && - WARN_ON(!debugfs_initialized()))) + if (WARN_ON(!tracefs_initialized())) return ERR_PTR(-ENODEV); /* -- cgit v1.2.3 From 31cd0e119d50cf27ebe214d1a8f7ca36692f13a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 Jul 2020 17:16:41 +0200 Subject: timers: Recalculate next timer interrupt only when necessary The nohz tick code recalculates the timer wheel's next expiry on each idle loop iteration. On the other hand, the base next expiry is now always cached and updated upon timer enqueue and execution. Only timer dequeue may leave base->next_expiry out of date (but then its stale value won't ever go past the actual next expiry to be recalculated). Since recalculating the next_expiry isn't a free operation, especially when the last wheel level is reached to find out that no timer has been enqueued at all, reuse the next expiry cache when it is known to be reliable, which it is most of the time. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200723151641.12236-1-frederic@kernel.org --- kernel/time/timer.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 77e21e98ec32..96d802e9769e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -204,6 +204,7 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; @@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } @@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } @@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. 
We can only do that when @basej is past base->clk @@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base) while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; base->next_expiry = __next_timer_interrupt(base); -- cgit v1.2.3 From 062d3f95b630113e1156a31f376ad36e25da29a7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jul 2020 21:10:42 +0100 Subject: sched: Warn if garbage is passed to default_wake_function() Since the default_wake_function() passes its flags onto try_to_wake_up(), warn if those flags collide with internal values. Given that the supplied flags are garbage, no repair can be done but at least alert the user to the damage they are causing. In the belief that these errors should be picked up during testing, the warning is only compiled in under CONFIG_SCHED_DEBUG. Signed-off-by: Chris Wilson Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200723201042.18861-1-chris@chris-wilson.co.uk --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5dece9b34e25..2142c6767682 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4485,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); -- cgit v1.2.3 From 142781e108b13b2b0e8f035cfb5bfbbc8f14d887 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:56 +0200 Subject: entry: Provide generic syscall entry functionality On syscall entry certain work needs to be done: - Establish state (lockdep, context tracking, tracing) - Conditional work (ptrace, seccomp, audit...) This code is needlessly duplicated and different in all architectures. Provide a generic version based on the x86 implementation which has all the RCU and instrumentation bits right. As interrupt/exception entry from user space needs parts of the same functionality, provide a function for this as well. syscall_enter_from_user_mode() and irqentry_enter_from_user_mode() must be called right after the low level ASM entry. The calling code must be non-instrumentable. After the functions returns state is correct and the subsequent functions can be instrumented. 
Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220519.513463269@linutronix.de --- kernel/Makefile | 1 + kernel/entry/Makefile | 12 +++++++ kernel/entry/common.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 kernel/entry/Makefile create mode 100644 kernel/entry/common.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..fde2000d0d0d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ +obj-y += entry/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile new file mode 100644 index 000000000000..c207d202bf3a --- /dev/null +++ b/kernel/entry/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Prevent the noinstr section from being pestered by sanitizer and other goodies +# as long as these things cannot be disabled per function. +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong +CFLAGS_common.o += -fno-stack-protector + +obj-$(CONFIG_GENERIC_ENTRY) += common.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c new file mode 100644 index 000000000000..1d636dee2fec --- /dev/null +++ b/kernel/entry/common.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + */ +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_check_user_regs(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +static long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long ti_work) +{ + long ret = 0; + + /* Handle ptrace */ + if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = arch_syscall_enter_tracehook(regs); + if (ret || (ti_work & _TIF_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (ti_work & _TIF_SECCOMP) { + ret = __secure_computing(NULL); + if (ret == -1L) + return ret; + } + + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall); + + syscall_enter_audit(regs, syscall); + + return ret ? 
: syscall; +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + unsigned long ti_work; + + enter_from_user_mode(regs); + instrumentation_begin(); + + local_irq_enable(); + ti_work = READ_ONCE(current_thread_info()->flags); + if (ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work); + instrumentation_end(); + + return syscall; +} + +noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} -- cgit v1.2.3 From a9f3a74a29af095f3e1b89e9176f8127912ae0f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:57 +0200 Subject: entry: Provide generic syscall exit function Like syscall entry all architectures have similar and pointlessly different code to handle pending work before returning from a syscall to user space. 1) One-time syscall exit work: - rseq syscall exit - audit - syscall tracing - tracehook (single stepping) 2) Preparatory work - Exit to user mode loop (common TIF handling). - Architecture specific one time work arch_exit_to_user_mode_prepare() - Address limit and lockdep checks 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes arch_exit_to_user_mode() to handle e.g. speculation mitigations Provide a generic version based on the x86 code which has all the RCU and instrumentation protections right. Provide a variant for interrupt return to user mode as well which shares the above #2 and #3 work items. After syscall_exit_to_user_mode() and irqentry_exit_to_user_mode() the architecture code just has to return to user space. The code after returning from these functions must not be instrumented. Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220519.613977173@linutronix.de --- kernel/entry/common.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 1d636dee2fec..0a051bb91a56 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,8 @@ #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -82,7 +84,174 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) return syscall; } +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc. + * 4) Tell lockdep that interrupts are enabled + */ +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* Workaround to allow gradual conversion of architecture code */ +void __weak arch_do_signal(struct pt_regs *regs) { } + +static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + /* + * Before returning to user space ensure that all pending work + * items have been completed. 
+ */ + while (ti_work & EXIT_TO_USER_MODE_WORK) { + + local_irq_enable_exit_to_user(ti_work); + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (ti_work & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + if (ti_work & _TIF_SIGPENDING) + arch_do_signal(regs); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + /* Architecture specific TIF work */ + arch_exit_to_user_mode_work(regs, ti_work); + + /* + * Disable interrupts and reevaluate the work flags as they + * might have changed while interrupts and preemption was + * enabled above. + */ + local_irq_disable_exit_to_user(); + ti_work = READ_ONCE(current_thread_info()->flags); + } + + /* Return the latest work state for arch_exit_to_user_mode() */ + return ti_work; +} + +static void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + + lockdep_assert_irqs_disabled(); + + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + +#ifndef _TIF_SINGLESTEP +static inline bool report_single_step(unsigned long ti_work) +{ + return false; +} +#else +/* + * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_usermode(). + */ +#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) + +static inline bool report_single_step(unsigned long ti_work) +{ + return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; +} +#endif + +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +{ + bool step; + + audit_syscall_exit(regs); + + if (ti_work & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(ti_work); + if (step || ti_work & _TIF_SYSCALL_TRACE) + arch_syscall_exit_tracehook(regs, step); +} + +/* + * Syscall specific exit to user mode preparation. Runs with interrupts + * enabled. + */ +static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + u32 cached_flags = READ_ONCE(current_thread_info()->flags); + unsigned long nr = syscall_get_nr(current, regs); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) + local_irq_enable(); + } + + rseq_syscall(regs); + + /* + * Do one-time syscall specific work. If these work items are + * enabled, we want to run them exactly once per syscall exit with + * interrupts enabled. 
+ */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + syscall_exit_to_user_mode_prepare(regs); + local_irq_disable_exit_to_user(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); } + +noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} -- cgit v1.2.3 From a5497bab5f72dce38a259a53fd3ac1239a7ecf40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:58 +0200 Subject: entry: Provide generic interrupt entry/exit code Like the syscall entry/exit code interrupt/exception entry after the real low level ASM bits should not be different accross architectures. Provide a generic version based on the x86 code. irqentry_enter() is called after the low level entry code and irqentry_exit() must be invoked right before returning to the low level code which just contains the actual return logic. The code before irqentry_enter() and irqentry_exit() must not be instrumented. Code after irqentry_enter() and before irqentry_exit() can be instrumented. irqentry_enter() invokes irqentry_enter_from_user_mode() if the interrupt/exception came from user mode. If if entered from kernel mode it handles the kernel mode variant of establishing state for lockdep, RCU and tracing depending on the kernel context it interrupted (idle, non-idle). Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220519.723703209@linutronix.de --- kernel/entry/common.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 0a051bb91a56..495f5c051b03 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -255,3 +255,120 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_end(); exit_to_user_mode(); } + +irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + return ret; + } + + /* + * If this entry hit the idle task invoke rcu_irq_enter() whether + * RCU is watching or not. + * + * Interupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for __rcu_is_watching() here would prevent the nesting + * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interupt and eventually claim + * quiescient state and end grace periods prematurely. + * + * Unconditionally invoke rcu_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irq_enter_from_user_mode(). 
+ */ + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + instrumentation_begin(); + rcu_irq_enter_check_tick(); + /* Use the combo lockdep/tracing function */ + trace_hardirqs_off(); + instrumentation_end(); + + return ret; +} + +void irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + if (need_resched()) + preempt_schedule_irq(); + } +} + +void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + /* Check whether this returns to user mode */ + if (user_mode(regs)) { + irqentry_exit_to_user_mode(regs); + } else if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + rcu_irq_exit(); + } +} -- cgit v1.2.3 From 935ace2fb5cc49ae88bd1f1735ddc51cdc2ebfb3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:59 +0200 Subject: entry: Provide infrastructure for work before transitioning to guest mode Entering a guest is similar to exiting to user space. Pending work like handling signals, rescheduling, task work etc. needs to be handled before that. Provide generic infrastructure to avoid duplication of the same handling code all over the place. The transfer to guest mode handling is different from the exit to usermode handling, e.g. vs. rseq and live patching, so a separate function is used. The initial list of work items handled is: TIF_SIGPENDING, TIF_NEED_RESCHED, TIF_NOTIFY_RESUME Architecture specific TIF flags can be added via defines in the architecture specific include files. The calling convention is also different from the syscall/interrupt entry functions as KVM invokes this from the outer vcpu_run() loop with interrupts and preemption enabled. To prevent missing a pending work item it invokes a check for pending TIF work from interrupt disabled code right before transitioning to guest mode. The lockdep, RCU and tracing state handling is also done directly around the switch to and from guest mode. 
Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220519.833296398@linutronix.de --- kernel/entry/Makefile | 3 ++- kernel/entry/kvm.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 kernel/entry/kvm.c (limited to 'kernel') diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index c207d202bf3a..34c8a3f1c735 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,4 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c new file mode 100644 index 000000000000..eb1a8a4c867c --- /dev/null +++ b/kernel/entry/kvm.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & _TIF_SIGPENDING) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(NULL); + } + + ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + if (ret) + return ret; + + ti_work = READ_ONCE(current_thread_info()->flags); + } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + return 0; +} + +int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = READ_ONCE(current_thread_info()->flags); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(vcpu, ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); -- cgit v1.2.3 From fe5ed7ab99c656bd2f5b79b49df0e9ebf2cead8a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jul 2020 17:44:20 +0200 Subject: uprobes: Change handle_swbp() to send SIGTRAP with si_code=SI_KERNEL, to fix GDB regression If a tracee is uprobed and it hits int3 inserted by debugger, handle_swbp() does send_sig(SIGTRAP, current, 0) which means si_code == SI_USER. This used to work when this code was written, but then GDB started to validate si_code and now it simply can't use breakpoints if the tracee has an active uprobe: # cat test.c void unused_func(void) { } int main(void) { return 0; } # gcc -g test.c -o test # perf probe -x ./test -a unused_func # perf record -e probe_test:unused_func gdb ./test -ex run GNU gdb (GDB) 10.0.50.20200714-git ... Program received signal SIGTRAP, Trace/breakpoint trap. 0x00007ffff7ddf909 in dl_main () from /lib64/ld-linux-x86-64.so.2 (gdb) The tracee hits the internal breakpoint inserted by GDB to monitor shared library events but GDB misinterprets this SIGTRAP and reports a signal. Change handle_swbp() to use force_sig(SIGTRAP), this matches do_int3_user() and fixes the problem. This is the minimal fix for -stable, arch/x86/kernel/uprobes.c is equally wrong; it should use send_sigtrap(TRAP_TRACE) instead of send_sig(SIGTRAP), but this doesn't confuse GDB and needs another x86-specific patch. 
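For reference, a ptrace-based debugger typically classifies a SIGTRAP stop by its si_code along these lines (simplified sketch, not GDB's actual code), which is why reporting SI_USER made the uprobe trap look like an ordinary signal:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

static void classify_sigtrap(pid_t pid)
{
    siginfo_t si;

    if (ptrace(PTRACE_GETSIGINFO, pid, 0, &si) != 0)
        return;

    if (si.si_code == SI_USER || si.si_code == SI_TKILL)
        /* Sent by a process: report it as a plain signal. */
        printf("SIGTRAP sent by userspace, not a breakpoint\n");
    else
        /* SI_KERNEL, TRAP_BRKPT, ...: treat it as a debug trap. */
        printf("kernel-generated trap, handle as breakpoint\n");
}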
Reported-by: Aaron Merey Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Reviewed-by: Srikar Dronamraju Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200723154420.GA32043@redhat.com --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0862873dba..5f8b0c52fd2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2199,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs) if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't -- cgit v1.2.3 From e5ebffe18e5add85acb23c7a0f74509749967204 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 19 Jul 2020 17:10:45 -0600 Subject: dyndbg: rename __verbose section to __dyndbg dyndbg populates its callsite info into __verbose section, change that to a more specific and descriptive name, __dyndbg. Also, per checkpatch: simplify __attribute(..) to __section(__dyndbg) declaration. and 1 spelling fix, decriptor Acked-by: Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20200719231058.1586423-6-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..e7b4ff7e4fd0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3237,7 +3237,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__verbose", + info->debug = section_objs(info, "__dyndbg", sizeof(*info->debug), &info->num_debug); return 0; -- cgit v1.2.3 From 13efa616124f7eec7d6a58adeeef31864aa03879 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 25 Jul 2020 16:56:29 +0800 Subject: sched/uclamp: Remove unnecessary mutex_init() The uclamp_mutex lock is initialized statically via DEFINE_MUTEX(), it is unnecessary to initialize it runtime via mutex_init(). Signed-off-by: Qinglang Miao Signed-off-by: Ingo Molnar Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/20200725085629.98292-1-miaoqinglang@huawei.com --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bd8e5211d31f..678253450ebb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1425,8 +1425,6 @@ static void __init init_uclamp(void) enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu)); -- cgit v1.2.3 From a7ef9b28aa8d72a1656fa6f0a01bbd1493886317 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 25 Jul 2020 19:51:10 +0100 Subject: locking/lockdep: Fix overflow in presentation of average lock-time Though the number of lock-acquisitions is tracked as unsigned long, this is passed as the divisor to div_s64() which interprets it as a s32, giving nonsense values with more than 2 billion acquisitons. E.g. 
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg ------------------------------------------------------------------------- 2350439395 0.07 353.38 649647067.36 0.-32 Signed-off-by: Chris Wilson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20200725185110.11588-1-chris@chris-wilson.co.uk --- kernel/locking/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 5525cd3ba0c8..02ef87f50df2 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) -- cgit v1.2.3 From 3f9969f2c040ba2ba635b6b5a7051f404bcc634d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Jul 2020 12:51:56 -0700 Subject: bpf: Fix pos computation for bpf_iter seq_ops->start() Currently, the pos pointer in bpf iterator map/task/task_file seq_ops->start() is always incremented. This is incorrect. It should be increased only if *pos is 0 (for SEQ_START_TOKEN) since these start() function actually returns the first real object. If *pos is not 0, it merely found the object based on the state in seq->private, and not really advancing the *pos. This patch fixed this issue by only incrementing *pos if it is 0. Note that the old *pos calculation, although not correct, does not affect correctness of bpf_iter as bpf_iter seq_file->read() does not support llseek. This patch also renamed "mid" in bpf_map iterator seq_file private data to "map_id" for better clarity. 
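The resulting convention for a bpf_iter seq_ops->start() is roughly the following (generic sketch; example_get_curr_or_next() is a hypothetical lookup helper standing in for the per-target ones):

static void *example_seq_start(struct seq_file *seq, loff_t *pos)
{
    void *obj = example_get_curr_or_next(seq->private);

    if (!obj)
        return NULL;

    /* Only advance *pos for the initial SEQ_START_TOKEN position. */
    if (*pos == 0)
        ++*pos;
    return obj;
}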
Fixes: 6086d29def80 ("bpf: Add bpf_map iterator") Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722195156.4029817-1-yhs@fb.com --- kernel/bpf/map_iter.c | 16 ++++++---------- kernel/bpf/task_iter.c | 6 ++++-- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8a7af11b411f..5926c76d854e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -7,7 +7,7 @@ #include struct bpf_iter_seq_map_info { - u32 mid; + u32 map_id; }; static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) @@ -15,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_map_info *info = seq->private; struct bpf_map *map; - map = bpf_map_get_curr_or_next(&info->mid); + map = bpf_map_get_curr_or_next(&info->map_id); if (!map) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return map; } static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_map_info *info = seq->private; - struct bpf_map *map; ++*pos; - ++info->mid; + ++info->map_id; bpf_map_put((struct bpf_map *)v); - map = bpf_map_get_curr_or_next(&info->mid); - if (!map) - return NULL; - - return map; + return bpf_map_get_curr_or_next(&info->map_id); } struct bpf_iter__bpf_map { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 2feecf095609..1039e52ebd8b 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -51,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) if (!task) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return task; } @@ -210,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) return NULL; } - ++*pos; + if (*pos == 0) + ++*pos; info->task = task; info->files = files; -- cgit v1.2.3 From a228a64fc1e4428e2b96dc68e9ad3c447095c9e7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Jul 2020 18:10:18 -0700 Subject: bpf: Add bpf_prog iterator It's mostly a copy paste of commit 6086d29def80 ("bpf: Add bpf_map iterator") that is use to implement bpf_seq_file opreations to traverse all bpf programs. 
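A user-supplied program for this new target could look roughly like this (sketch only; it assumes vmlinux.h BTF types and the BPF_SEQ_PRINTF convenience macro used by the bpf_iter selftests):

SEC("iter/bpf_prog")
int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx)
{
    struct seq_file *seq = ctx->meta->seq;
    struct bpf_prog *prog = ctx->prog;

    /* prog is NULL on the final (stop) invocation. */
    if (!prog)
        return 0;

    BPF_SEQ_PRINTF(seq, "%u %s\n", prog->aux->id, prog->aux->name);
    return 0;
}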
v1->v2: Tweak to use build time btf_id Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Daniel Borkmann --- kernel/bpf/Makefile | 2 +- kernel/bpf/prog_iter.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 +++++++++ 3 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/prog_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1131a921e1a6..e6eb9c0402da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c new file mode 100644 index 000000000000..6541b577d69f --- /dev/null +++ b/kernel/bpf/prog_iter.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include + +struct bpf_iter_seq_prog_info { + u32 prog_id; +}; + +static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + struct bpf_prog *prog; + + prog = bpf_prog_get_curr_or_next(&info->prog_id); + if (!prog) + return NULL; + + if (*pos == 0) + ++*pos; + return prog; +} + +static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + + ++*pos; + ++info->prog_id; + bpf_prog_put((struct bpf_prog *)v); + return bpf_prog_get_curr_or_next(&info->prog_id); +} + +struct bpf_iter__bpf_prog { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_prog *, prog); +}; + +DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog) + +static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_prog ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.prog = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_prog_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_prog_seq_show(seq, v, false); +} + +static void bpf_prog_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_prog_seq_show(seq, v, true); + else + bpf_prog_put((struct bpf_prog *)v); +} + +static const struct seq_operations bpf_prog_seq_ops = { + .start = bpf_prog_seq_start, + .next = bpf_prog_seq_next, + .stop = bpf_prog_seq_stop, + .show = bpf_prog_seq_show, +}; + +BTF_ID_LIST(btf_bpf_prog_id) +BTF_ID(struct, bpf_prog) + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", + .seq_ops = &bpf_prog_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_prog, prog), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_prog_iter_init(void) +{ + bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id; + return 
bpf_iter_reg_target(&bpf_prog_reg_info); +} + +late_initcall(bpf_prog_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d07417d17712..ee290b1f2d9e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3044,6 +3044,25 @@ again: return map; } +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); +again: + prog = idr_get_next(&prog_idr, id); + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From 14fc6bd6b79c430f615500d0fe6cea4722110db8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:09 -0700 Subject: bpf: Refactor bpf_iter_reg to have separate seq_info member There is no functionality change for this patch. Struct bpf_iter_reg is used to register a bpf_iter target, which includes information for both prog_load, link_create and seq_file creation. This patch puts fields related seq_file creation into a different structure. This will be useful for map elements iterator where one iterator covers different map types and different map types may have different seq_ops, init/fini private_data function and private_data size. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184109.590030-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 12 ++++++------ kernel/bpf/map_iter.c | 8 ++++++-- kernel/bpf/prog_iter.c | 8 ++++++-- kernel/bpf/task_iter.c | 16 ++++++++++++---- 4 files changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dd612b80b9fe..5b2387d6aa1f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -218,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->fini_seq_private) - iter_priv->tinfo->reg_info->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) + iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -433,16 +433,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + tinfo->reg_info->seq_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->init_seq_private) { - err = tinfo->reg_info->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->seq_info->init_seq_private) { + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 5926c76d854e..1a69241fb1e2 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_map_seq_ops = { BTF_ID_LIST(btf_bpf_map_id) BTF_ID(struct, bpf_map) -static struct bpf_iter_reg bpf_map_reg_info = { - .target = "bpf_map", +static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = 
&bpf_map_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + +static struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_map_seq_info, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c index 6541b577d69f..53a73c841c13 100644 --- a/kernel/bpf/prog_iter.c +++ b/kernel/bpf/prog_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_prog_seq_ops = { BTF_ID_LIST(btf_bpf_prog_id) BTF_ID(struct, bpf_prog) -static struct bpf_iter_reg bpf_prog_reg_info = { - .target = "bpf_prog", +static const struct bpf_iter_seq_info bpf_prog_seq_info = { .seq_ops = &bpf_prog_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), +}; + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_prog, prog), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_prog_seq_info, }; static int __init bpf_prog_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1039e52ebd8b..6d9cd23869bf 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -319,25 +319,32 @@ BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) -static struct bpf_iter_reg task_reg_info = { - .target = "task", +static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static struct bpf_iter_reg task_reg_info = { + .target = "task", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_seq_info, }; -static struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", +static const struct bpf_iter_seq_info task_file_seq_info = { .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + +static struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), @@ -345,6 +352,7 @@ static struct bpf_iter_reg task_file_reg_info = { { offsetof(struct bpf_iter__task_file, file), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_file_seq_info, }; static int __init task_iter_init(void) -- cgit v1.2.3 From f9c792729581bd8b8473af163e8ab426c2c61d89 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:10 -0700 Subject: bpf: Refactor to provide aux info to bpf_iter_init_seq_priv_t This patch refactored target bpf_iter_init_seq_priv_t callback function to accept additional information. This will be needed in later patches for map element targets since a particular map should be passed to traverse elements for that particular map. In the future, other information may be passed to target as well, e.g., pid, cgroup id, etc. to customize the iterator. 
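After this change a target-side init callback has the following shape (hypothetical target; struct example_seq_info and its stored field are made up for illustration):

struct example_seq_info {
    struct bpf_iter_aux_info *aux;
};

static int example_init_seq_private(void *priv_data,
                                    struct bpf_iter_aux_info *aux)
{
    struct example_seq_info *info = priv_data;

    /*
     * aux carries target-specific parameters; the map-element iterator
     * added later in this series passes the map to be iterated here.
     */
    info->aux = aux;
    return 0;
}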
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184110.590156-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/task_iter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5b2387d6aa1f..8fa94cb1b5a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -442,7 +442,7 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) } if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); if (err) goto release_seq_file; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 6d9cd23869bf..232df29793e9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -293,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v) } } -static int init_seq_pidns(void *priv_data) +static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_task_common *common = priv_data; -- cgit v1.2.3 From afbf21dce668ef59482037596eaffbe5041e094c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:11 -0700 Subject: bpf: Support readonly/readwrite buffers in verifier Readonly and readwrite buffer register states are introduced. Totally four states, PTR_TO_RDONLY_BUF[_OR_NULL] and PTR_TO_RDWR_BUF[_OR_NULL] are supported. As suggested by their respective names, PTR_TO_RDONLY_BUF[_OR_NULL] are for readonly buffers and PTR_TO_RDWR_BUF[_OR_NULL] for read/write buffers. These new register states will be used by later bpf map element iterator. New register states share some similarity to PTR_TO_TP_BUFFER as it will calculate accessed buffer size during verification time. The accessed buffer size will be later compared to other metrics during later attach/link_create time. Similar to reg_state PTR_TO_BTF_ID_OR_NULL in bpf iterator programs, PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL reg_types can be set at prog->aux->bpf_ctx_arg_aux, and bpf verifier will retrieve the values during btf_ctx_access(). Later bpf map element iterator implementation will show how such information will be assigned during target registeration time. The verifier is also enhanced such that PTR_TO_RDONLY_BUF can be passed to ARG_PTR_TO_MEM[_OR_NULL] helper argument, and PTR_TO_RDWR_BUF can be passed to ARG_PTR_TO_MEM[_OR_NULL] or ARG_PTR_TO_UNINIT_MEM. 
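From a BPF program's point of view the new states boil down to the following (illustrative sketch; it uses the bpf_map_elem iterator context added later in this series and assumes a map with at least 4-byte keys and 8-byte values):

SEC("iter/bpf_map_elem")
int check_access(struct bpf_iter__bpf_map_elem *ctx)
{
    struct seq_file *seq = ctx->meta->seq;
    __u32 *key = ctx->key;
    __u64 *val = ctx->value;

    if (!key || !val)
        return 0;

    *val += 1;                             /* store via rdwr_buf: accepted */
    bpf_seq_write(seq, key, sizeof(*key)); /* rdonly_buf as ARG_PTR_TO_MEM: accepted */
    /*
     * A store through key would be rejected by the verifier with
     * "cannot write into rdonly_buf".
     */
    return 0;
}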
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184111.590274-1-yhs@fb.com --- kernel/bpf/btf.c | 13 ++++++++ kernel/bpf/verifier.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee36b7f60936..0fd6bb62be3a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3806,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } + + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off && + (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || + ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } + } + if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a6703bc3f36..8d6979db48d8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL; + type == PTR_TO_MEM_OR_NULL || + type == PTR_TO_RDONLY_BUF_OR_NULL || + type == PTR_TO_RDWR_BUF_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -503,6 +505,10 @@ static const char * const reg_type_str[] = { [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", }; static char slot_type_char[] = { @@ -2173,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BTF_ID_OR_NULL: + case PTR_TO_RDONLY_BUF: + case PTR_TO_RDONLY_BUF_OR_NULL: + case PTR_TO_RDWR_BUF: + case PTR_TO_RDWR_BUF_OR_NULL: return true; default: return false; @@ -3052,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env, return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) +static int __check_buffer_access(struct bpf_verifier_env *env, + const char *buf_info, + const struct bpf_reg_state *reg, + int regno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); + "R%d invalid %s buffer access: off=%d, size=%d", + regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off) || reg->var_off.value) { @@ -3071,12 +3082,45 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, regno, off, tn_buf); return -EACCES; } + + return 0; +} + +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + int err; + + err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + if (err) + return err; + if (off + size > env->prog->aux->max_tp_access) env->prog->aux->max_tp_access = off + size; return 0; } +static int check_buffer_access(struct bpf_verifier_env 
*env, + const struct bpf_reg_state *reg, + int regno, int off, int size, + bool zero_size_allowed, + const char *buf_info, + u32 *max_access) +{ + int err; + + err = __check_buffer_access(env, buf_info, reg, regno, off, size); + if (err) + return err; + + if (off + size > *max_access) + *max_access = off + size; + + return 0; +} + /* BPF architecture zero extends alu32 ops into 64-bit registesr */ static void zext_32_to_64(struct bpf_reg_state *reg) { @@ -3427,6 +3471,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == PTR_TO_RDONLY_BUF) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_buffer_access(env, reg, regno, off, size, "rdonly", + false, + &env->prog->aux->max_rdonly_access); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_RDWR_BUF) { + err = check_buffer_access(env, reg, regno, off, size, "rdwr", + false, + &env->prog->aux->max_rdwr_access); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -3668,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); + case PTR_TO_RDONLY_BUF: + if (meta && meta->raw_mode) + return -EACCES; + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdonly", + &env->prog->aux->max_rdonly_access); + case PTR_TO_RDWR_BUF: + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdwr", + &env->prog->aux->max_rdwr_access); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3933,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -6806,6 +6881,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_BTF_ID; } else if (reg->type == PTR_TO_MEM_OR_NULL) { reg->type = PTR_TO_MEM; + } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { + reg->type = PTR_TO_RDONLY_BUF; + } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { + reg->type = PTR_TO_RDWR_BUF; } if (is_null) { /* We don't need id and ref_obj_id from this point -- cgit v1.2.3 From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:12 -0700 Subject: bpf: Implement bpf iterator for map elements The bpf iterator for map elements are implemented. The bpf program will receive four parameters: bpf_iter_meta *meta: the meta data bpf_map *map: the bpf_map whose elements are traversed void *key: the key of one element void *value: the value of the same element Here, meta and map pointers are always valid, and key has register type PTR_TO_RDONLY_BUF_OR_NULL and value has register type PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key and value during verification time. 
Later, these values will be compared against the values in the actual map to ensure all accesses are within range. A new field iter_seq_info is added to bpf_map_ops which is used to add map type specific information, i.e., seq_ops, init/fini seq_file func and seq_file private data size. Subsequent patches will have actual implementation for bpf_map_ops->iter_seq_info. In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog attr->link_create.flags, which indicates that attr->link_create.target_fd is a map_fd. The reason for such an explicit flag is for possible future cases where one bpf iterator may allow more than one possible customization, e.g., pid and cgroup id for task_file. Current kernel internal implementation only allows the target to register at most one required bpf_iter_link_info. To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 85 +++++++++++++++++++++++++++++++++++++++++---------- kernel/bpf/map_iter.c | 30 +++++++++++++++++- 2 files changed, 98 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + 
bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. */ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = 
container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + static int __init bpf_map_iter_init(void) { + int ret; + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; - return bpf_iter_reg_target(&bpf_map_reg_info); + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); -- cgit v1.2.3 From d6c4503cc29638f328e1a6e6fefbdbda401c28fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:14 -0700 Subject: bpf: Implement bpf iterator for hash maps The bpf iterators for hash, percpu hash, lru hash and lru percpu hash are implemented. During link time, bpf_iter_reg->check_target() will check map type and ensure the program access key/value region is within the map defined key/value size limit. For percpu hash and lru hash maps, the bpf program will receive values for all cpus. The map element bpf iterator infrastructure will prepare value properly before passing the value pointer to the bpf program. This patch set supports readonly map keys and read/write map values. It does not support deleting map elements, e.g., from hash tables. If there is a user case for this, the following mechanism can be used to support map deletion for hashtab, etc. - permit a new bpf program return value, e.g., 2, to let bpf iterator know the map element should be removed. - since bucket lock is taken, the map element will be queued. - once bucket lock is released after all elements under this bucket are traversed, all to-be-deleted map elements can be deleted. 
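For the supported read/update usage, an iterator program over a hash map with 4-byte keys and 8-byte values might look roughly like this (sketch modelled on the bpf_iter selftests; BPF_SEQ_PRINTF is their convenience macro):

SEC("iter/bpf_map_elem")
int dump_hash_map(struct bpf_iter__bpf_map_elem *ctx)
{
    struct seq_file *seq = ctx->meta->seq;
    __u32 *key = ctx->key;
    __u64 *val = ctx->value;

    /* key/value are NULL on the final (stop) invocation. */
    if (!key || !val)
        return 0;

    /*
     * The offsets accessed here become max_rdonly_access/max_rdwr_access
     * and are checked against the map's key/value size at link time.
     * For percpu hash maps, val points at the packed per-CPU copies.
     */
    BPF_SEQ_PRINTF(seq, "key %u value %llu\n", *key, *val);
    return 0;
}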
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184114.590470-1-yhs@fb.com --- kernel/bpf/hashtab.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 24 ++++++- 2 files changed, 217 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d4378d7d442b..024276787055 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1612,6 +1612,196 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +struct bpf_iter_seq_hash_map_info { + struct bpf_map *map; + struct bpf_htab *htab; + void *percpu_value_buf; // non-zero means percpu hash + unsigned long flags; + u32 bucket_id; + u32 skip_elems; +}; + +static struct htab_elem * +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, + struct htab_elem *prev_elem) +{ + const struct bpf_htab *htab = info->htab; + unsigned long flags = info->flags; + u32 skip_elems = info->skip_elems; + u32 bucket_id = info->bucket_id; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + struct bucket *b; + u32 i, count; + + if (bucket_id >= htab->n_buckets) + return NULL; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + /* no update/deletion on this bucket, prev_elem should be still valid + * and we won't skip elements. + */ + n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); + elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); + if (elem) + return elem; + + /* not found, unlock and go to the next bucket */ + b = &htab->buckets[bucket_id++]; + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + for (i = bucket_id; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + flags = htab_lock_bucket(htab, b); + + count = 0; + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + if (count >= skip_elems) { + info->flags = flags; + info->bucket_id = i; + info->skip_elems = count; + return elem; + } + count++; + } + + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + struct htab_elem *elem; + + elem = bpf_hash_map_seq_find_next(info, NULL); + if (!elem) + return NULL; + + if (*pos == 0) + ++*pos; + return elem; +} + +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_hash_map_seq_find_next(info, v); +} + +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + u32 roundup_key_size, roundup_value_size; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + int ret = 0, off = 0, cpu; + struct bpf_prog *prog; + void __percpu *pptr; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + roundup_key_size = round_up(map->key_size, 8); + ctx.key = elem->key; + if (!info->percpu_value_buf) { + ctx.value = elem->key + roundup_key_size; + } else { + roundup_value_size = round_up(map->value_size, 8); + pptr = htab_elem_get_ptr(elem, map->key_size); + for_each_possible_cpu(cpu) { + 
bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + ctx.value = info->percpu_value_buf; + } + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_hash_map_seq_show(seq, v); +} + +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + if (!v) + (void)__bpf_hash_map_seq_show(seq, NULL); + else + htab_unlock_bucket(info->htab, + &info->htab->buckets[info->bucket_id], + info->flags); +} + +static int bpf_iter_init_hash_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + seq_info->htab = container_of(map, struct bpf_htab, map); + return 0; +} + +static void bpf_iter_fini_hash_map(void *priv_data) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_hash_map_seq_ops = { + .start = bpf_hash_map_seq_start, + .next = bpf_hash_map_seq_next, + .stop = bpf_hash_map_seq_stop, + .show = bpf_hash_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_hash_map_seq_ops, + .init_seq_private = bpf_iter_init_hash_map, + .fini_seq_private = bpf_iter_fini_hash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), +}; + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, @@ -1626,6 +1816,7 @@ const struct bpf_map_ops htab_map_ops = { BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_map_btf_id; @@ -1643,6 +1834,7 @@ const struct bpf_map_ops htab_lru_map_ops = { BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, + .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ @@ -1760,6 +1952,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_percpu_map_btf_id; @@ -1775,6 +1968,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8a1f9b3355d0..bcb68b55bf65 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -101,7 +101,29 @@ static struct bpf_iter_reg bpf_map_reg_info = { static int bpf_iter_check_map(struct bpf_prog *prog, struct bpf_iter_aux_info *aux) { - return -EINVAL; + u32 key_acc_size, value_acc_size, key_size, value_size; + struct bpf_map *map = aux->map; + bool is_percpu = false; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + is_percpu = true; + else if (map->map_type != BPF_MAP_TYPE_HASH && + 
map->map_type != BPF_MAP_TYPE_LRU_HASH) + return -EINVAL; + + key_acc_size = prog->aux->max_rdonly_access; + value_acc_size = prog->aux->max_rdwr_access; + key_size = map->key_size; + if (!is_percpu) + value_size = map->value_size; + else + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + + if (key_acc_size > key_size || value_acc_size > value_size) + return -EACCES; + + return 0; } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, -- cgit v1.2.3 From d3cc2ab546adc6e52b65f36f7c34820d2830d0c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:15 -0700 Subject: bpf: Implement bpf iterator for array maps The bpf iterators for array and percpu array are implemented. Similar to hash maps, for percpu array map, bpf program will receive values from all cpus. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184115.590532-1-yhs@fb.com --- kernel/bpf/arraymap.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 6 ++- 2 files changed, 142 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c66e8273fccd..8ff419b632a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -487,6 +487,142 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +struct bpf_iter_seq_array_map_info { + struct bpf_map *map; + void *percpu_value_buf; + u32 index; +}; + +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + if (info->index >= map->max_entries) + return NULL; + + if (*pos == 0) + ++*pos; + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + ++*pos; + ++info->index; + if (info->index >= map->max_entries) + return NULL; + + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int off = 0, cpu = 0; + void __percpu **pptr; + u32 size; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, v == NULL); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + + if (!info->percpu_value_buf) { + ctx.value = v; + } else { + pptr = v; + size = round_up(map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + size); + off += size; + } + ctx.value = info->percpu_value_buf; + } + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_array_map_seq_show(seq, v); +} + +static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) 
+{ + if (!v) + (void)__bpf_array_map_seq_show(seq, NULL); +} + +static int bpf_iter_init_array_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + return 0; +} + +static void bpf_iter_fini_array_map(void *priv_data) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_array_map_seq_ops = { + .start = bpf_array_map_seq_start, + .next = bpf_array_map_seq_next, + .stop = bpf_array_map_seq_stop, + .show = bpf_array_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_array_map_seq_ops, + .init_seq_private = bpf_iter_init_array_map, + .fini_seq_private = bpf_iter_fini_array_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), +}; + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, @@ -506,6 +642,7 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int percpu_array_map_btf_id; @@ -521,6 +658,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_check_btf = array_map_check_btf, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index bcb68b55bf65..fbe1f557cb88 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -106,10 +106,12 @@ static int bpf_iter_check_map(struct bpf_prog *prog, bool is_percpu = false; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH) + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) return -EINVAL; key_acc_size = prog->aux->max_rdonly_access; -- cgit v1.2.3 From 7b04d6d60fcfb5b2200ffebb9cfb90927bdfeec7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:44 -0700 Subject: bpf: Separate bpf_get_[stack|stackid] for perf events BPF Calling get_perf_callchain() on perf_events from PEBS entries may cause unwinder errors. To fix this issue, the callchain is fetched early. Such perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY. Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may also cause unwinder errors. To fix this, add separate version of these two helpers, bpf_get_[stack|stackid]_pe. These two hepers use callchain in bpf_perf_event_data_kern->data->callchain. 
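From the program side nothing changes; a perf_event program keeps calling the same helper and the kernel now backs it with the _pe variant, which uses the callchain fetched at sample time when available (sketch, assuming libbpf BTF-style map definitions and a BPF_MAP_TYPE_STACK_TRACE map):

struct {
    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
    __uint(max_entries, 1024);
    __type(key, __u32);
    __uint(value_size, 64 * sizeof(__u64));
} stackmap SEC(".maps");

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
    /* Dispatched to bpf_get_stackid_pe for events with the early callchain. */
    long id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);

    if (id >= 0)
        bpf_printk("user stack id %ld", id);
    return 0;
}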
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com --- kernel/bpf/stackmap.c | 184 ++++++++++++++++++++++++++++++++++++++++++----- kernel/trace/bpf_trace.c | 4 +- 2 files changed, 168 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 48d8e739975f..5beb2f8c23da 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) #endif } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, @@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & 
__PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; @@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else init_nr = sysctl_perf_event_max_stack - num_elem; - if (kernel && task) + if (trace_in) + trace = trace_in; + else if (kernel && task) trace = get_callchain_entry_for_task(task, init_nr); else trace = get_perf_callchain(regs, init_nr, kernel, user, @@ -556,7 +640,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, { struct pt_regs *regs = task_pt_regs(task); - return __bpf_get_stack(regs, task, buf, size, flags); + return __bpf_get_stack(regs, task, NULL, buf, size, flags); } BTF_ID_LIST(bpf_get_task_stack_btf_ids) @@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .btf_id = bpf_get_task_stack_btf_ids, }; +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct 
bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3cc0dcb60ca2..cb91ef902cc4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: -- cgit v1.2.3 From 5d99cb2c86775b4780c02a339a9578bf9471ead9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:45 -0700 Subject: bpf: Fail PERF_EVENT_IOC_SET_BPF when bpf_get_[stack|stackid] cannot work bpf_get_[stack|stackid] on perf_events with precise_ip uses callchain attached to perf_sample_data. If this callchain is not presented, do not allow attaching BPF program that calls bpf_get_[stack|stackid] to this event. In the error case, -EPROTO is returned so that libbpf can identify this error and print proper hint message. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-3-songliubraving@fb.com --- kernel/bpf/verifier.c | 3 +++ kernel/events/core.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d6979db48d8..cd14e70f2d07 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn env->prog->has_callchain_buf = true; } + if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) + env->prog->call_get_stack = true; + if (changes_data) clear_all_pkt_pointers(env); return 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..ddcfd2fb5cc5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (IS_ERR(prog)) return PTR_ERR(prog); + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. 
+ */ + bpf_prog_put(prog); + return -EPROTO; + } + event->prog = prog; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); -- cgit v1.2.3 From 7d9c3427894fe70d1347b4820476bf37736d2ff0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 23 Jul 2020 23:47:43 -0500 Subject: bpf: Make cgroup storages shared between programs on the same cgroup

This change comes in several parts:

One, the restriction that the CGROUP_STORAGE map can only be used by one program is removed. This results in the removal of the field 'aux' in struct bpf_cgroup_storage_map, removal of relevant code associated with the field, and removal of the now-noop functions bpf_free_cgroup_storage and bpf_cgroup_storage_release.

Second, we permit a key of type u64 as the key to the map. Providing such a key type indicates that the map should ignore attach type when comparing map keys. However, for simplicity, newly linked storage will still have the attach type at link time in its key struct. cgroup_storage_check_btf is adapted to accept u64 as the type of the key.

Third, because the storages are now shared, the storages cannot be unconditionally freed on program detach. There could be two ways to solve this issue:
* A. Reference count the usage of the storages, and free when the last program is detached.
* B. Free only when the storage is impossible to be referred to again, i.e. when either the cgroup_bpf it is attached to, or the map itself, is freed.
Option A has the side effect that, when the user detaches and reattaches a program, whether the program gets a fresh storage depends on whether there is another program attached using that storage. This could trigger races if the user is multi-threaded, and since nondeterminism in data races is evil, we go with option B. Both the map and the cgroup_bpf now track their associated storages, and the storage unlink and free are moved from cgroup_bpf_detach to cgroup_bpf_release and cgroup_storage_map_free. The latter also now holds the cgroup_mutex to prevent any races with the former.

Fourth, on attach, we reuse the old storage if the key already exists in the map, via cgroup_storage_lookup. If the storage does not exist yet, we create a new one and publish it at the last step of the attach process. This does not create a race condition because the cgroup_mutex is held for the whole attach. We keep track of an array of newly allocated storages, and if the attach fails, only those new storages are freed.
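[Editorial sketch of the user-visible effect of the u64 key; the map and section names below are made up and not taken from this patch. With a plain __u64 key the storage is keyed by cgroup id alone, so programs attached to the same cgroup with different attach types observe the same value.]

    /* assumes <linux/bpf.h> and <bpf/bpf_helpers.h> */
    struct {
            __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
            __type(key, __u64);            /* cgroup inode id only, no attach type */
            __type(value, __u64);
    } shared_counter SEC(".maps");

    SEC("cgroup_skb/egress")
    int count_egress(struct __sk_buff *skb)
    {
            __u64 *cnt = bpf_get_local_storage(&shared_counter, 0);

            __sync_fetch_and_add(cnt, 1);  /* visible to e.g. the ingress program as well */
            return 1;
    }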
Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d5401c6106728a00890401190db40020a1f84ff1.1595565795.git.zhuyifei@google.com --- kernel/bpf/cgroup.c | 67 ++++++++------ kernel/bpf/core.c | 12 --- kernel/bpf/local_storage.c | 216 ++++++++++++++++++++++++--------------------- 3 files changed, 156 insertions(+), 139 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac53102e244a..957cce1d5168 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], - struct bpf_prog *prog) + struct bpf_cgroup_storage *new_storages[], + enum bpf_attach_type type, + struct bpf_prog *prog, + struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage_key key; + struct bpf_map *map; + + key.cgroup_inode_id = cgroup_id(cgrp); + key.attach_type = type; for_each_cgroup_storage_type(stype) { + map = prog->aux->cgroup_storage[stype]; + if (!map) + continue; + + storages[stype] = cgroup_storage_lookup((void *)map, &key, false); + if (storages[stype]) + continue; + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { - storages[stype] = NULL; - bpf_cgroup_storages_free(storages); + bpf_cgroup_storages_free(new_storages); return -ENOMEM; } + + new_storages[stype] = storages[stype]; } return 0; @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], - struct cgroup* cgrp, + struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_unlink(storages[stype]); -} - /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. 
It * doesn't free link memory, which will eventually be done by bpf_link's @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; + struct list_head *storages = &cgrp->bpf.storages; + struct bpf_cgroup_storage *storage, *stmp; + unsigned int type; mutex_lock(&cgroup_mutex); for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; - struct bpf_prog_list *pl, *tmp; + struct bpf_prog_list *pl, *pltmp; - list_for_each_entry_safe(pl, tmp, progs, node) { + list_for_each_entry_safe(pl, pltmp, progs, node) { list_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) bpf_cgroup_link_auto_detach(pl->link); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_prog_array_free(old_array); } + list_for_each_entry_safe(storage, stmp, storages, list_cg) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + mutex_unlock(&cgroup_mutex); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_LIST_HEAD(&cgrp->bpf.storages); + for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (IS_ERR(pl)) return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + if (bpf_cgroup_storages_alloc(storage, new_storage, type, + prog ? 
: link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storages_free(storage); + bpf_cgroup_storages_free(new_storage); return -ENOMEM; } list_add_tail(&pl->node, progs); @@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup; - bpf_cgroup_storages_free(old_storage); if (old_prog) bpf_prog_put(old_prog); else static_branch_inc(&cgroup_bpf_enabled_key); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup: @@ -493,9 +508,7 @@ cleanup: pl->prog = old_prog; pl->link = NULL; } - bpf_cgroup_storages_free(pl->storage); - bpf_cgroup_storages_assign(pl->storage, old_storage); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_free(new_storage); if (!old_prog) { list_del(&pl->node); kfree(pl); @@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7be02e555ab9..bde93344164d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } -static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); - } -} - void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; u32 i; - bpf_free_cgroup_storage(aux); for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 51bd5a8cb01b..3b2c70197d78 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) return container_of(map, struct bpf_cgroup_storage_map, map); } -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) +static bool attach_type_isolated(const struct bpf_map *map) { - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; + return map->key_size == sizeof(struct bpf_cgroup_storage_key); +} + +static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, + const void *_key1, const void *_key2) +{ + if (attach_type_isolated(&map->map)) { + const struct bpf_cgroup_storage_key *key1 = _key1; + const struct bpf_cgroup_storage_key *key2 = 
_key2; + + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + } else { + const __u64 *cgroup_inode_id1 = _key1; + const __u64 *cgroup_inode_id2 = _key2; + + if (*cgroup_inode_id1 < *cgroup_inode_id2) + return -1; + else if (*cgroup_inode_id1 > *cgroup_inode_id2) + return 1; + } return 0; } -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; @@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup( storage = container_of(node, struct bpf_cgroup_storage, node); - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; @@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; @@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, return 0; } -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); @@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +static int cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && @@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, return 0; } -static int 
cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); @@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, if (!storage) goto enoent; - storage = list_next_entry(storage, list); + storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); + struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; + + if (attach_type_isolated(&map->map)) { + struct bpf_cgroup_storage_key *next = _next_key; + *next = storage->key; + } else { + __u64 *next = _next_key; + *next = storage->key.cgroup_inode_id; + } return 0; enoent: @@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) struct bpf_map_memory mem; int ret; - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && + attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) @@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct list_head *storages = &map->list; + struct bpf_cgroup_storage *storage, *stmp; + + mutex_lock(&cgroup_mutex); + + list_for_each_entry_safe(storage, stmp, storages, list_map) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + + mutex_unlock(&cgroup_mutex); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); @@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - struct btf_member *m; - u32 offset, size; - - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ + if (attach_type_isolated(map)) { + struct btf_member *m; + u32 offset, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + } else { + u32 int_data; - /* - * Key_type must be a structure with two fields. 
- */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; + /* + * Key is expected to be u64, which stores the cgroup_inode_id + */ - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + } return 0; } -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu; @@ -426,38 +467,13 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - - spin_lock_bh(&map->lock); - if (map->aux && map->aux != aux) - goto unlock; if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) - goto unlock; + return -EBUSY; - map->aux = aux; aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->aux == aux) { - WARN_ON(aux->cgroup_storage[stype] != _map); - map->aux = NULL; - aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); + return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) @@ -578,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); + list_add(&storage->list_map, &map->list); + list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } @@ -596,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) root = &map->root; rb_erase(&storage->node, root); - list_del(&storage->list); + list_del(&storage->list_map); + list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } -- cgit v1.2.3 From dfcdf0e9ad2e006196986f363c99b2097aec5ef0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 24 Jul 2020 16:17:53 -0500 Subject: bpf/local_storage: Fix build without CONFIG_CGROUP local_storage.o has its compile guard as CONFIG_BPF_SYSCALL, which does not imply that CONFIG_CGROUP is on. Including cgroup-internal.h when CONFIG_CGROUP is off cause a compilation failure. 
Fixes: f67cfc233706 ("bpf: Make cgroup storages shared between programs on the same cgroup") Reported-by: kernel test robot Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724211753.902969-1-zhuyifei1999@gmail.com --- kernel/bpf/local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3b2c70197d78..571bb351ed3b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,12 +9,12 @@ #include #include -#include "../cgroup/cgroup-internal.h" - DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF +#include "../cgroup/cgroup-internal.h" + #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) -- cgit v1.2.3 From 2b9b305fcdda1810bdffeb599361174eb2cd0b7c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 24 Jul 2020 13:05:02 -0700 Subject: bpf: Fix build on architectures with special bpf_user_pt_regs_t Architectures like s390, powerpc, arm64, riscv have speical definition of bpf_user_pt_regs_t. So we need to cast the pointer before passing it to bpf_get_stack(). This is similar to bpf_get_stack_tp(). Fixes: 03d42fd2d83f ("bpf: Separate bpf_get_[stack|stackid] for perf events BPF") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724200503.3629591-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5beb2f8c23da..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -678,6 +678,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; @@ -685,7 +686,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) - return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -705,8 +706,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); /* restore nr */ trace->nr = nr; @@ -718,8 +718,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); } return err; -- cgit v1.2.3 From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. 
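[Editorial usage sketch, assuming a libbpf build that exposes the low-level bpf_link_create() wrapper; variable names and error handling are illustrative only.]

    int prog_fd = bpf_program__fd(prog);
    int link_fd = bpf_link_create(prog_fd, ifindex /* target netdev */, BPF_XDP, NULL);

    if (link_fd < 0)
            return -errno;  /* e.g. a program or another link is already attached */
    /* the XDP program stays attached until the last FD of this link is closed */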
bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From aadfc2f957cb470a5a7e52cc41a2fa86e784bcd2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Jul 2020 11:19:51 +0200 Subject: entry: Correct 'noinstr' attributes The noinstr attribute is to be specified before the return type in the same way 'inline' is used. Similar cases were recently fixed for x86 in commit 7f6fa101dfac ("x86: Correct noinstr qualifiers"), but the generic entry code was based on the the original version and did not carry the fix over. 
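[For reference, the convention being enforced, shown on one of the prototypes touched below:]

    noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs);   /* placed like 'inline' */
    irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);   /* builds, but inconsistent */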
Fixes: a5497bab5f72 ("entry: Provide generic interrupt entry/exit code") Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200725091951.744848-3-mingo@kernel.org --- kernel/entry/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 495f5c051b03..9852e0d62d95 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -256,7 +256,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) exit_to_user_mode(); } -irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs) +noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { .exit_rcu = false, @@ -333,7 +333,7 @@ void irqentry_exit_cond_resched(void) } } -void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { lockdep_assert_irqs_disabled(); -- cgit v1.2.3 From 7665a47f70b3f64bf09c233cc7df73fde9e506f1 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 24 Jul 2020 11:05:31 +0200 Subject: signal: fix typo in dequeue_synchronous_signal() s/postive/positive/ Signed-off-by: Pavel Machek (CIP) Link: https://lore.kernel.org/r/20200724090531.GA14409@amd [christian.brauner@ubuntu.com: tweak commit message] Signed-off-by: Christian Brauner --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ee22ec78fd6d..6f16f7c5d375 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -719,7 +719,7 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info) * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { - /* Synchronous signals have a postive si_code */ + /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; -- cgit v1.2.3 From 45e9504f109b84986ba9d25a273a5d5bd3f3e9d9 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 16 Jul 2020 16:39:05 +0800 Subject: genirq/irqdomain: Remove redundant NULL pointer check on fwnode The is_fwnode_irqchip() helper will check if the fwnode_handle is empty. There is no need to perform a redundant check outside of it. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200716083905.287-1-yuzenghui@huawei.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a4c2c915511d..155460fc0147 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, if (!domain) return NULL; - if (fwnode && is_fwnode_irqchip(fwnode)) { + if (is_fwnode_irqchip(fwnode)) { fwid = container_of(fwnode, struct irqchip_fwid, fwnode); switch (fwid->type) { -- cgit v1.2.3 From 8a667928cfae337b85bee4fb0dbc09f3c2715856 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 10 Jul 2020 23:18:22 +0000 Subject: irqdomain: Export irq_domain_update_bus_token Add export for irq_domain_update_bus_token() so that we can allow drivers like the qcom-pdc driver to be loadable as a module. 
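[Editorial sketch of what the export enables; the names approximate the qcom-pdc driver and are illustrative only.]

    /* in a now-loadable irqchip module's init/probe path */
    pdc_domain = irq_domain_create_hierarchy(parent_domain, 0, PDC_MAX_IRQS,
                                             of_node_to_fwnode(np),
                                             &qcom_pdc_ops, pdc_region);
    if (!pdc_domain)
            return -ENOMEM;

    /* previously only callable from built-in code; now usable from module code */
    irq_domain_update_bus_token(pdc_domain, DOMAIN_BUS_WAKEUP);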
Signed-off-by: John Stultz Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Cc: Andy Gross Cc: Bjorn Andersson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Cc: Linus Walleij Cc: Maulik Shah Cc: Lina Iyer Cc: Saravana Kannan Cc: Todd Kjos Cc: Greg Kroah-Hartman Cc: linux-arm-msm@vger.kernel.org Cc: iommu@lists.linux-foundation.org Cc: linux-gpio@vger.kernel.org Link: https://lore.kernel.org/r/20200710231824.60699-2-john.stultz@linaro.org --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 155460fc0147..76cd7ebd1178 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain, mutex_unlock(&irq_domain_mutex); } +EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs -- cgit v1.2.3 From 8d16f5b979660c0fdcfa21a418cc03f1fde60cf7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 10 Jul 2020 23:18:23 +0000 Subject: genirq: Export irq_chip_retrigger_hierarchy and irq_chip_set_vcpu_affinity_parent Add EXPORT_SYMBOL_GPL entries for irq_chip_retrigger_hierarchy() and irq_chip_set_vcpu_affinity_parent() so that we can allow drivers like the qcom-pdc driver to be loadable as a module. Signed-off-by: John Stultz Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Cc: Andy Gross Cc: Bjorn Andersson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Cc: Linus Walleij Cc: Maulik Shah Cc: Lina Iyer Cc: Saravana Kannan Cc: Todd Kjos Cc: Greg Kroah-Hartman Cc: linux-arm-msm@vger.kernel.org Cc: iommu@lists.linux-foundation.org Cc: linux-gpio@vger.kernel.org Link: https://lore.kernel.org/r/20200710231824.60699-3-john.stultz@linaro.org --- kernel/irq/chip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..ba6ce66d7ed6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1478,6 +1478,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return 0; } +EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy); /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt @@ -1492,7 +1493,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) return -ENOSYS; } - +EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent); /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data -- cgit v1.2.3 From ed00495333ccc80fc8fb86fb43773c3c2a499466 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Mon, 27 Jul 2020 14:48:52 +0200 Subject: locking/lockdep: Fix TRACE_IRQFLAGS vs. NMIs Prior to commit: 859d069ee1dd ("lockdep: Prepare for NMI IRQ state tracking") IRQ state tracking was disabled in NMIs due to nmi_enter() doing lockdep_off() -- with the obvious requirement that NMI entry call nmi_enter() before trace_hardirqs_off(). [ AFAICT, PowerPC and SH violate this order on their NMI entry ] However, that commit explicitly changed lockdep_hardirqs_*() to ignore lockdep_off() and breaks every architecture that has irq-tracing in it's NMI entry that hasn't been fixed up (x86 being the only fixed one at this point). 
The reason for this change is that by ignoring lockdep_off() we can: - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*() which was going to to give header-recursion issues with the seqlock rework. - allow these lockdep_assert_*() macros to function in NMI context. Restore the previous state of things and allow an architecture to opt-in to the NMI IRQ tracking support, however instead of relying on lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so over-all entry ordering doesn't need to change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200727124852.GK119549@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..8b0b28b4546b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) * and not rely on hardware state like normal interrupts. */ if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + /* * Skip: * - recursion check, because NMI can hit lockdep; @@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) * they will restore the software state. This ensures the software * state is consistent inside NMIs as well. */ - if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) return; /* -- cgit v1.2.3 From f0c7baca180046824e07fc5f1326e83a8fd150c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2020 22:44:41 +0200 Subject: genirq/affinity: Make affinity setting if activated opt-in John reported that on a RK3288 system the perf per CPU interrupts are all affine to CPU0 and provided the analysis: "It looks like what happens is that because the interrupts are not per-CPU in the hardware, armpmu_request_irq() calls irq_force_affinity() while the interrupt is deactivated and then request_irq() with IRQF_PERCPU | IRQF_NOBALANCING. Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls irq_setup_affinity() which returns early because IRQF_PERCPU and IRQF_NOBALANCING are set, leaving the interrupt on its original CPU." This was broken by the recent commit which blocked interrupt affinity setting in hardware before activation of the interrupt. While this works in general, it does not work for this particular case. As contrary to the initial analysis not all interrupt chip drivers implement an activate callback, the safe cure is to make the deferred interrupt affinity setting at activation time opt-in. Implement the necessary core logic and make the two irqchip implementations for which this is required opt-in. In hindsight this would have been the right thing to do, but ... 
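[Editorial sketch of the opt-in for a hypothetical driver; the setter follows the IRQD_AFFINITY_ON_ACTIVATE flag introduced by this patch.]

    static int foo_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
                                    unsigned int nr_irqs, void *arg)
    {
            int i, ret;

            ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
            if (ret)
                    return ret;

            for (i = 0; i < nr_irqs; i++)
                    /* tell the core to defer affinity setting until activation */
                    irqd_set_affinity_on_activate(irq_domain_get_irq_data(domain, virq + i));

            return 0;
    }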
Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly") Reported-by: John Keeping Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de --- kernel/irq/manage.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a9fec53e159..48c38e09c673 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, struct irq_desc *desc = irq_data_to_desc(data); /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * useable state so startup works. */ - if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); -- cgit v1.2.3 From aa251fc5b936d3ddb4b4c4b36427eb9aa3347c82 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 25 Jul 2020 13:30:55 +0100 Subject: genirq/debugfs: Add missing irqchip flags Recently introduced irqchip flags lack the corresponding printouts in debugfs. Add them. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/874kpvydxc.wl-maz@kernel.org --- kernel/irq/debugfs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4f9f844074db..b95ff5d5f4bd 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), @@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_WAKEUP_STATE), BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), + + BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), + + BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), }; static const struct irq_bit_descr irqdesc_states[] = { -- cgit v1.2.3 From b54cecf5e2293d15620f7b3f8d1bf486243d5643 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Jun 2020 12:10:40 +0300 Subject: fsnotify: pass dir argument to handle_event() callback The 'inode' argument to handle_event(), sometimes referred to as 'to_tell' is somewhat obsolete. It is a remnant from the times when a group could only have an inode mark associated with an event. We now pass an iter_info array to the callback, with all marks associated with an event. Most backends ignore this argument, with two exceptions: 1. dnotify uses it for sanity check that event is on directory 2. fanotify uses it to report fid of directory on directory entry modification events Remove the 'inode' argument and add a 'dir' argument. The callback function signature is deliberately changed, because the meaning of the argument has changed and the arguments have been documented. 
The 'dir' argument is set to when 'file_name' is specified and it is referring to the directory that the 'file_name' entry belongs to. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/audit_fsnotify.c | 10 +++++----- kernel/audit_tree.c | 6 +++--- kernel/audit_watch.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 3596448bfdab..30ca239285a3 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -152,11 +152,11 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) } /* Update mark data in audit rules based on fsnotify events. */ -static int audit_mark_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, + const struct qstr *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..2ce2ac1ce100 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,9 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk) audit_schedule_prune(); } -static int audit_tree_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, +static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e09c551ae52d..61fd601f1edf 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -464,9 +464,9 @@ void audit_remove_watch_rule(struct audit_krule *krule) } /* Update watch data in audit rules based on fsnotify events. */ -static int audit_watch_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, +static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { -- cgit v1.2.3 From b4e9c9549f62329d2412f899635fddc5212b9cd4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 1 Jun 2020 19:42:40 -0400 Subject: introduction of regset ->get() wrappers, switching ELF coredumps to those Two new helpers: given a process and regset, dump into a buffer. regset_get() takes a buffer and size, regset_get_alloc() takes size and allocates a buffer. Return value in both cases is the amount of data actually dumped in case of success or -E... on error. In both cases the size is capped by regset->n * regset->size, so ->get() is called with offset 0 and size no more than what regset expects. binfmt_elf.c callers of ->get() are switched to using those; the other caller (copy_regset_to_user()) will need some preparations to switch. 
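[Editorial sketch of the intended calling pattern, modeled on the binfmt_elf conversion; fill_note() stands in for that file's internal note writer and is illustrative.]

    const struct user_regset_view *view = task_user_regset_view(task);
    const struct user_regset *regset = &view->regsets[i];
    void *data;
    int ret;

    ret = regset_get_alloc(task, regset, regset->n * regset->size, &data);
    if (ret > 0)
            /* 'ret' bytes of register data are now in 'data' */
            fill_note(&note, "LINUX", regset->core_note_type, ret, data);
    kfree(data);   /* NULL on failure, so unconditional free is fine */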
Signed-off-by: Al Viro --- kernel/Makefile | 2 +- kernel/regset.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 kernel/regset.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..e6e03380a0f1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o + async.o range.o smpboot.o ucount.o regset.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/regset.c b/kernel/regset.c new file mode 100644 index 000000000000..6b39fa0993ec --- /dev/null +++ b/kernel/regset.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +static int __regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + void *p = *data, *to_free = NULL; + int res; + + if (!regset->get) + return -EOPNOTSUPP; + if (size > regset->n * regset->size) + size = regset->n * regset->size; + if (!p) { + to_free = p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + res = regset->get(target, regset, 0, size, p, NULL); + if (unlikely(res < 0)) { + kfree(to_free); + return res; + } + *data = p; + if (regset->get_size) { // arm64-only kludge, will go away + unsigned max_size = regset->get_size(target, regset); + if (size > max_size) + size = max_size; + } + return size; +} + +int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void *data) +{ + return __regset_get(target, regset, size, &data); +} +EXPORT_SYMBOL(regset_get); + +int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + *data = NULL; + return __regset_get(target, regset, size, data); +} +EXPORT_SYMBOL(regset_get_alloc); -- cgit v1.2.3 From dc12d7968f9c9540494deb1285854b18ca4465ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Feb 2020 12:25:14 -0500 Subject: copy_regset_to_user(): do all copyout at once. Turn copy_regset_to_user() into regset_get_alloc() + copy_to_user(). Now all ->get() calls have a kernel buffer as destination. Note that we'd already eliminated the callers of copy_regset_to_user() with non-zero offset; now that argument is simply unused. Uninlined, while we are at it. 
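[For context, this mirrors how the PTRACE_GETREGSET path consumes the helper; sketched, with variable names approximating kernel/ptrace.c.]

    const struct user_regset_view *view = task_user_regset_view(task);

    ret = copy_regset_to_user(task, view, setno, 0 /* offset, now unused */,
                              kiov->iov_len, kiov->iov_base);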
Signed-off-by: Al Viro --- kernel/regset.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/regset.c b/kernel/regset.c index 6b39fa0993ec..0a610983ce43 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -52,3 +52,29 @@ int regset_get_alloc(struct task_struct *target, return __regset_get(target, regset, size, data); } EXPORT_SYMBOL(regset_get_alloc); + +/** + * copy_regset_to_user - fetch a thread's user_regset data into user memory + * @target: thread to be examined + * @view: &struct user_regset_view describing user thread machine state + * @setno: index in @view->regsets + * @offset: offset into the regset data, in bytes + * @size: amount of data to copy, in bytes + * @data: user-mode pointer to copy into + */ +int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, + unsigned int offset, unsigned int size, + void __user *data) +{ + const struct user_regset *regset = &view->regsets[setno]; + void *buf; + int ret; + + ret = regset_get_alloc(target, regset, size, &buf); + if (ret > 0) + ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; + kfree(buf); + return ret; +} -- cgit v1.2.3 From 7717cb9bdd0421faa432a4e0d499fdba6e2394c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Feb 2020 20:48:16 -0500 Subject: regset: new method and helpers for it ->regset_get() takes task+regset+buffer, returns the amount of free space left in the buffer on success and -E... on error. buffer is represented as struct membuf - a pair of (kernel) pointer and amount of space left Primitives for writing to such: * membuf_write(buf, data, size) * membuf_zero(buf, size) * membuf_store(buf, value) These are implemented as inlines (in case of membuf_store - a macro). All writes are sequential; they become no-ops when there's no space left. Return value of all primitives is the amount of space left after the operation, so they can be used as return values of ->regset_get(). Example of use: // stores pt_regs of task + 64 bytes worth of zeroes + 32bit PID of task int foo_get(struct task_struct *task, const struct regset *regset, struct membuf to) { membuf_write(&to, task_pt_regs(task), sizeof(struct pt_regs)); membuf_zero(&to, 64); return membuf_store(&to, (u32)task_tgid_vnr(task)); } regset_get()/regset_get_alloc() taught to use that thing if present. By the end of the series all users of ->get() will be converted; then ->get() and ->get_size() can go. Note that unlike ->get() this thing always starts at offset 0 and, since it only writes to kernel buffer, can't fail on copyout. It can, of course, fail for other reasons, but those tend to be less numerous. The caller guarantees that the buffer size won't be bigger than regset->n * regset->size. That simplifies life for quite a few instances. 
Signed-off-by: Al Viro --- kernel/regset.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/regset.c b/kernel/regset.c index 0a610983ce43..eaeaefbbd39e 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target, void *p = *data, *to_free = NULL; int res; - if (!regset->get) + if (!regset->get && !regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; @@ -20,6 +20,16 @@ static int __regset_get(struct task_struct *target, if (!p) return -ENOMEM; } + if (regset->regset_get) { + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { + kfree(to_free); + return res; + } + *data = p; + return size - res; + } res = regset->get(target, regset, 0, size, p, NULL); if (unlikely(res < 0)) { kfree(to_free); -- cgit v1.2.3 From 1e6986c9db21265bac1435a344b4446c51a3f4d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2020 15:34:20 -0400 Subject: regset: kill ->get() no instances left Signed-off-by: Al Viro --- kernel/regset.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/regset.c b/kernel/regset.c index eaeaefbbd39e..586823786f39 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target, void *p = *data, *to_free = NULL; int res; - if (!regset->get && !regset->regset_get) + if (!regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; @@ -20,28 +20,14 @@ static int __regset_get(struct task_struct *target, if (!p) return -ENOMEM; } - if (regset->regset_get) { - res = regset->regset_get(target, regset, - (struct membuf){.p = p, .left = size}); - if (res < 0) { - kfree(to_free); - return res; - } - *data = p; - return size - res; - } - res = regset->get(target, regset, 0, size, p, NULL); - if (unlikely(res < 0)) { + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { kfree(to_free); return res; } *data = p; - if (regset->get_size) { // arm64-only kludge, will go away - unsigned max_size = regset->get_size(target, regset); - if (size > max_size) - size = max_size; - } - return size; + return size - res; } int regset_get(struct task_struct *target, -- cgit v1.2.3 From 82ace1efb3cb1d49a1681cc6e31156047d5ae1f2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:44 +0300 Subject: fsnotify: create helper fsnotify_inode() Simple helper to consolidate biolerplate code. 
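[The consolidation in callers is of the following form; sketch only.]

    /* before */
    fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
    /* after */
    fsnotify_inode(inode, FS_MODIFY);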
Link: https://lore.kernel.org/r/20200722125849.17418-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..0c655c039506 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); - fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, - tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) -- cgit v1.2.3 From 7dbe6080167860df0bf8627e55fd5154f366cc7a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:43 +0300 Subject: audit: do not set FS_EVENT_ON_CHILD in audit marks mask The audit group marks mask does not contain any events possible on a child so setting the flag FS_EVENT_ON_CHILD in the mask is counter productive. It may lead to the undesired outcome of setting the dentry flag DCACHE_FSNOTIFY_PARENT_WATCHED on a directory inode even though it is not watching children, because the audit mark contribute the flag FS_EVENT_ON_CHILD to the inode's fsnotify_mask and another mark could be contributing an event that is possible on child to the inode's mask. Furthermore in the following patches we want to use FS_EVENT_ON_CHILD for non-dir inodes for other purposes so stop using the flag. Link: https://lore.kernel.org/r/20200722125849.17418-4-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 30ca239285a3..bd3a6b79316a 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group; /* fsnotify events we care about. */ #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF) static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 61fd601f1edf..e23d54bcc587 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) + FS_MOVE_SELF | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { -- cgit v1.2.3 From b9a1b9772509cbc6f6aa8bcd0b019f6347a2b631 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:48 +0300 Subject: fsnotify: create method handle_inode_event() in fsnotify_operations The method handle_event() grew a lot of complexity due to the design of fanotify and merging of ignore masks. Most backends do not care about this complex functionality, so we can hide this complexity from them. Introduce a method handle_inode_event() that serves those backends and passes a single inode mark and less arguments. This change converts all backends except fanotify and inotify to use the simplified handle_inode_event() method. 
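For such a backend the whole event path boils down to one callback of the shape used in the conversions below (a sketch with a hypothetical foo backend; the exact fsnotify_ops declaration is in the fsnotify backend header):

/* Called only for inode marks of this group; no mark iterator needed. */
static int foo_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
				  struct inode *inode, struct inode *dir,
				  const struct qstr *file_name)
{
	if (mask & FS_DELETE_SELF)
		pr_debug("foo: watched inode is going away\n");

	return 0;
}

static const struct fsnotify_ops foo_fsnotify_ops = {
	.handle_inode_event	= foo_handle_inode_event,
};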
In pricipal, inotify could have also used the new method, but that would require passing more arguments on the simple helper (data, data_type, cookie), so we leave it with the handle_event() method. Link: https://lore.kernel.org/r/20200722125849.17418-9-amir73il@gmail.com Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/audit_fsnotify.c | 20 ++++++++------------ kernel/audit_tree.c | 10 ++++------ kernel/audit_watch.c | 17 +++++++---------- 3 files changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index bd3a6b79316a..bfcfcd61adb6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) } /* Update mark data in audit rules based on fsnotify events. */ -static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; - const struct inode *inode = fsnotify_data_inode(data, data_type); audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); - BUG_ON(group != audit_fsnotify_group); - - if (WARN_ON(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) || + WARN_ON_ONCE(!inode)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); - } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) { audit_autoremove_mark_rule(audit_mark); + } return 0; } static const struct fsnotify_ops audit_mark_fsnotify_ops = { - .handle_event = audit_mark_handle_event, + .handle_inode_event = audit_mark_handle_event, .free_mark = audit_fsnotify_free_mark, }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2ce2ac1ce100..025d24abf15d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,11 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk) audit_schedule_prune(); } -static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *file_name) { return 0; } @@ -1070,7 +1068,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, } static const struct fsnotify_ops audit_tree_ops = { - .handle_event = audit_tree_handle_event, + .handle_inode_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, .free_mark = audit_tree_destroy_watch, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e23d54bcc587..246e5ba704c0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule) } /* Update watch data in audit rules based on fsnotify events. 
*/ -static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - const struct inode *inode = fsnotify_data_inode(data, data_type); struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); - BUG_ON(group != audit_watch_group); - WARN_ON(!inode); + if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) || + WARN_ON_ONCE(!inode)) + return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); @@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .handle_event = audit_watch_handle_event, + .handle_inode_event = audit_watch_handle_event, .free_mark = audit_watch_free_mark, }; -- cgit v1.2.3 From f6dfbe31e8fa5cbd5bc89df9d7f0fa0af7e69981 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 27 Jul 2020 18:54:11 +0100 Subject: bpf: Fix swapped arguments in calls to check_buffer_access There are a couple of arguments of the boolean flag zero_size_allowed and the char pointer buf_info when calling to function check_buffer_access that are swapped by mistake. Fix these by swapping them to correct the argument ordering. Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") Addresses-Coverity: ("Array compared to 0") Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200727175411.155179-1-colin.king@canonical.com --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd14e70f2d07..88bb25d08bf8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3477,14 +3477,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str[reg->type]); return -EACCES; } - err = check_buffer_access(env, reg, regno, off, size, "rdonly", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdonly", &env->prog->aux->max_rdonly_access); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, "rdwr", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdwr", &env->prog->aux->max_rdwr_access); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); -- cgit v1.2.3 From 112a0e4171e111e963aada3fe790c71accf4d705 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2020 16:34:00 +0900 Subject: kprobes: Remove unnecessary module_mutex locking from kprobe_optimizer() Since we already lock both kprobe_mutex and text_mutex in the optimizer, text will not be changed and the module unloading will be stopped inside kprobes_module_callback(). The mutex_lock() has originally been introduced to avoid conflict with text modification, at that point we didn't hold text_mutex. 
But after: f1c6ece23729 ("kprobes: Fix potential deadlock in kprobe_optimizer()") We started holding the text_mutex and don't need the modules mutex anyway. So remove the module_mutex locking. [ mingo: Amended the changelog. ] Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Cc: Jarkko Sakkinen Link: https://lore.kernel.org/r/20200728163400.e00b09c594763349f99ce6cb@kernel.org --- kernel/kprobes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 146c648eb943..e87679a48ba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -598,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work) mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -624,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); - mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); -- cgit v1.2.3 From 21a6ee14a8f277766618ef07154432b46528113e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 28 Jul 2020 19:17:55 +0800 Subject: sched: Remove duplicated tick_nohz_full_enabled() check In sched_update_tick_dependency() there's two calls that check whether nohz_full is enabled: tick_nohz_full_cpu() does it implicitly, while there's also an explicit call to tick_nohz_full_enabled(). Remove the duplicated, open coded check. [ mingo: Amended the changelog. ] Signed-off-by: Miaohe Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1595935075-14223-1-git-send-email-linmiaohe@huawei.com --- kernel/sched/sched.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f33c77258ea..296efd30d8c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1961,12 +1961,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; -- cgit v1.2.3 From 274b3f7bf34415eed106e479e4815e897ce5d763 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jul 2020 16:33:43 +0200 Subject: dma-contiguous: cleanup dma_alloc_contiguous Split out a cma_alloc_aligned helper to deal with the "interesting" calling conventions for cma_alloc, which then allows to the main function to be written straight forward. This also takes advantage of the fact that NULL dev arguments have been gone from the DMA API for a while. 
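In terms of the caller contract after this cleanup, a NULL return simply means "no CMA page available, use the normal allocator". A sketch of a typical, hypothetical caller:

/* Hypothetical helper showing the usual fallback pattern of DMA allocators. */
static struct page *foo_alloc_dma_pages(struct device *dev, size_t size, gfp_t gfp)
{
	struct page *page = dma_alloc_contiguous(dev, size, gfp);

	if (!page)	/* non-blocking gfp, no suitable CMA area, or tiny request */
		page = alloc_pages(gfp, get_order(size));

	return page;
}

Pages obtained either way can be handed back to dma_free_contiguous(dev, page, size), which falls back to the page allocator when the page did not come from CMA.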
Signed-off-by: Christoph Hellwig Reviewed-by: Nicolin Chen Reviewed-by: Barry Song --- kernel/dma/contiguous.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 15bc5026c485..cff7e60968b9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) +{ + unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT); + + return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN); +} + /** * dma_alloc_contiguous() - allocate contiguous pages * @dev: Pointer to device for which the allocation is performed. @@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - size_t count = size >> PAGE_SHIFT; - struct page *page = NULL; - struct cma *cma = NULL; - - if (dev && dev->cma_area) - cma = dev->cma_area; - else if (count > 1) - cma = dma_contiguous_default_area; - /* CMA can be used only in the context which permits sleeping */ - if (cma && gfpflags_allow_blocking(gfp)) { - size_t align = get_order(size); - size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - - page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); - } - - return page; + if (!gfpflags_allow_blocking(gfp)) + return NULL; + if (dev->cma_area) + return cma_alloc_aligned(dev->cma_area, size, gfp); + if (size <= PAGE_SIZE || !dma_contiguous_default_area) + return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } /** -- cgit v1.2.3 From 310ad7970a0dec847563dc6dba9e7e587d545622 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jul 2020 12:05:27 -0700 Subject: bpf: Fix build without CONFIG_NET when using BPF XDP link Entire net/core subsystem is not built without CONFIG_NET. linux/netdevice.h just assumes that it's always there, so the easiest way to fix this is to conditionally compile out bpf_xdp_link_attach() use in bpf/syscall.c. Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API") Reported-by: Randy Dunlap Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap # build-tested Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728190527.110830-1-andriin@fb.com --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e8c88db7e7a..cd3d599e9e90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3923,9 +3923,11 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; +#ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif default: ret = -EINVAL; } -- cgit v1.2.3 From 48001ea50d17f3eb06a552e9ecf21f7fc01b25da Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:08:18 -0700 Subject: PM, libnvdimm: Add runtime firmware activation support Abstract platform specific mechanics for nvdimm firmware activation behind a handful of generic ops. 
At the bus level ->activate_state() indicates the unified state (idle, busy, armed) of all DIMMs on the bus, and ->capability() indicates the system state expectations for activate. At the DIMM level ->activate_state() indicates the per-DIMM state, ->activate_result() indicates the outcome of the last activation attempt, and ->arm() attempts to transition the DIMM from 'idle' to 'armed'. A new hibernate_quiet_exec() facility is added to support firmware activation in an OS defined system quiesce state. It leverages the fact that the hibernate-freeze state wants to assert that a memory hibernation snapshot can be taken. This is in contrast to a platform firmware defined quiesce state that may forcefully quiet the memory controller independent of whether an individual device-driver properly supports hibernate-freeze. The libnvdimm sysfs interface is extended to support detection of a firmware activate capability. The mechanism supports enumeration and triggering of firmware activate, optionally in the hibernate_quiet_exec() context. [rafael: hibernate_quiet_exec() proposal] [vishal: fix up sparse warning, grammar in Documentation/] Cc: Pavel Machek Cc: Ira Weiny Cc: Len Brown Cc: Jonathan Corbet Cc: Dave Jiang Cc: Vishal Verma Reported-by: kernel test robot Co-developed-by: "Rafael J. Wysocki" Signed-off-by: "Rafael J. Wysocki" Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- kernel/power/hibernate.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..e6fab3f09c98 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -795,6 +795,103 @@ int hibernate(void) return error; } +/** + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. + * + * Return the @func return value or an error code if it cannot be executed. + */ +int hibernate_quiet_exec(int (*func)(void *data), void *data) +{ + int error, nr_calls = 0; + + lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } + + pm_prepare_console(); + + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; + goto exit; + } + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + suspend_console(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + resume_console(); + +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); /** * software_resume - Resume from a saved hibernation image. 
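Usage of the new facility is intentionally simple: hand it a callback that must run while every device is frozen, and propagate either the setup error or the callback's own result. A sketch of a hypothetical caller (the real user is the libnvdimm firmware-activation path, outside kernel/):

/* Hypothetical callback: runs with all devices frozen, so keep it short. */
static int foo_do_quiesced_work(void *data)
{
	int *result = data;

	*result = 0;	/* stand-in for e.g. triggering a firmware activation */
	return 0;
}

static int foo_run_quiesced(void)
{
	int result = -ENXIO;
	int ret = hibernate_quiet_exec(foo_do_quiesced_work, &result);

	return ret ? ret : result;
}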
-- cgit v1.2.3 From 4fd5750af02ab7bba7c58a073060cc1da8a69173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2020 23:49:18 +0200 Subject: sched,tracing: Convert to sched_set_fifo() One module user of sched_setscheduler() was overlooked and is obviously causing build failures. Convert ring_buffer_benchmark to use sched_set_fifo_low() when fifo==1 and sched_set_fifo() when fifo==2. This is a bit of an abuse, but it makes the thing 'work' again. Specifically, it enables all combinations that were previously possible: producer higher than consumer consumer higher than producer Fixes: 616d91b68cd5 ("sched: Remove sched_setscheduler*() EXPORTs") Reported-by: kernel test robot Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200720214918.GM5523@worktop.programming.kicks-ass.net --- kernel/trace/ring_buffer_benchmark.c | 48 +++++++++++++++++------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8df0aa810950..78e576575b79 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); static int producer_nice = MAX_NICE; static int consumer_nice = MAX_NICE; -static int producer_fifo = -1; -static int consumer_fifo = -1; +static int producer_fifo; +static int consumer_fifo; module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); @@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); module_param(producer_fifo, int, 0644); -MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); +MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo"); module_param(consumer_fifo, int, 0644); -MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); +MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo"); static int read_events; @@ -303,22 +303,22 @@ static void ring_buffer_producer(void) trace_printk("ERROR!\n"); if (!disable_reader) { - if (consumer_fifo < 0) + if (consumer_fifo) + trace_printk("Running Consumer at SCHED_FIFO %s\n", + consumer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Consumer at nice: %d\n", consumer_nice); - else - trace_printk("Running Consumer at SCHED_FIFO %d\n", - consumer_fifo); } - if (producer_fifo < 0) + if (producer_fifo) + trace_printk("Running Producer at SCHED_FIFO %s\n", + producer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Producer at nice: %d\n", producer_nice); - else - trace_printk("Running Producer at SCHED_FIFO %d\n", - producer_fifo); /* Let the user know that the test is running at low priority */ - if (producer_fifo < 0 && consumer_fifo < 0 && + if (!producer_fifo && !consumer_fifo && producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! 
This test is running at lowest priority.\n"); @@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void) * Run them as low-prio background tasks by default: */ if (!disable_reader) { - if (consumer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(consumer, SCHED_FIFO, ¶m); - } else + if (consumer_fifo >= 2) + sched_set_fifo(consumer); + else if (consumer_fifo == 1) + sched_set_fifo_low(consumer); + else set_user_nice(consumer, consumer_nice); } - if (producer_fifo >= 0) { - struct sched_param param = { - .sched_priority = producer_fifo - }; - sched_setscheduler(producer, SCHED_FIFO, ¶m); - } else + if (producer_fifo >= 2) + sched_set_fifo(producer); + else if (producer_fifo == 1) + sched_set_fifo_low(producer); + else set_user_nice(producer, producer_nice); return 0; -- cgit v1.2.3 From e65855a52b479f98674998cb23b21ef5a8144b04 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:47 +0100 Subject: sched/uclamp: Fix a deadlock when enabling uclamp static key The following splat was caught when setting uclamp value of a task: BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:49 cpus_read_lock+0x68/0x130 static_key_enable+0x1c/0x38 __sched_setscheduler+0x900/0xad8 Fix by ensuring we enable the key outside of the critical section in __sched_setscheduler() Fixes: 46609ce22703 ("sched/uclamp: Protect uclamp fast path code with static key") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-4-qais.yousef@arm.com --- kernel/sched/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 678253450ebb..e44d83f3e0e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1346,6 +1346,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1376,8 +1385,6 @@ static void __setscheduler_uclamp(struct task_struct *p, if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - static_branch_enable(&sched_uclamp_used); - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); -- cgit v1.2.3 From 13685c4a08fca9dd76bf53bfcbadc044ab2a08cb Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:45 +0100 Subject: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. 
For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com --- kernel/fork.c | 1 + kernel/sched/core.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 7 ++++ 3 files changed, 121 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..e75c2e41f3d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e44d83f3e0e6..12e1f3a2cabc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. 
+ * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, uclamp_update_root_tg(); } + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } + /* * We update all RUNNABLE tasks only when task groups are in use. 
* Otherwise, keep it simple and do just a lazy update at each next @@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; @@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. 
+ */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aea67d3d552..1b4d2dc270a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler, }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, #endif #ifdef CONFIG_SCHED_AUTOGROUP { -- cgit v1.2.3 From f891f19736bdf404845f97d8038054be37160ea8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 29 Jul 2020 17:09:19 +0530 Subject: kexec_file: Allow archs to handle special regions while locating memory hole Some architectures may have special memory regions, within the given memory range, which can't be used for the buffer in a kexec segment. Implement weak arch_kexec_locate_mem_hole() definition which arch code may override, to take care of special regions, while trying to locate a memory hole. Also, add the missing declarations for arch overridable functions and and drop the __weak descriptors in the declarations to avoid non-weak definitions from becoming weak. Signed-off-by: Hari Bathini Tested-by: Pingfan Liu Reviewed-by: Thiago Jung Bauermann Acked-by: Dave Young Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/159602273603.575379.17665852963340380839.stgit@hbathini --- kernel/kexec_file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..eb42d2efa16a 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -657,6 +657,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) return ret == 1 ? 0 : -EADDRNOTAVAIL; } +/** + * arch_kexec_locate_mem_hole - Find free memory to place the segments. + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + return kexec_locate_mem_hole(kbuf); +} + /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. @@ -669,7 +682,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) */ int kexec_add_buffer(struct kexec_buf *kbuf) { - struct kexec_segment *ksegment; int ret; @@ -697,7 +709,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = kexec_locate_mem_hole(kbuf); + ret = arch_kexec_locate_mem_hole(kbuf); if (ret) return ret; -- cgit v1.2.3 From 8ac68dc455d9d18241d44b96800d73229029ed34 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 28 Jul 2020 15:33:21 -0400 Subject: revert: 1320a4052ea1 ("audit: trigger accompanying records when no rules present") Unfortunately the commit listed in the subject line above failed to ensure that the task's audit_context was properly initialized/set before enabling the "accompanying records". Depending on the situation, the resulting audit_context could have invalid values in some of it's fields which could cause a kernel panic/oops when the task/syscall exists and the audit records are generated. 
We will revisit the original patch, with the necessary fixes, in a future kernel but right now we just want to fix the kernel panic with the least amount of added risk. Cc: stable@vger.kernel.org Fixes: 1320a4052ea1 ("audit: trigger accompanying records when no rules present") Reported-by: j2468h@googlemail.com Signed-off-by: Paul Moore --- kernel/audit.c | 1 - kernel/audit.h | 8 -------- kernel/auditsc.c | 3 +++ 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e33460e01b3b..9bf2b08b051f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1848,7 +1848,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_clear_dummy(ab->ctx); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/audit.h b/kernel/audit.h index f0233dc40b17..ddc22878433d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -290,13 +290,6 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); - -static inline void audit_clear_dummy(struct audit_context *ctx) -{ - if (ctx) - ctx->dummy = 0; -} - #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -330,7 +323,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED -#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a23390457..fd840c40abf7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1417,6 +1417,9 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ -- cgit v1.2.3 From b75058614fdd3140074a640b514f6a0b4d485a2d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:19 +0200 Subject: sched: tasks: Use sequence counter with associated spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_spinlock_t data type, which allows to associate a spinlock with the sequence counter. This enables lockdep to verify that the spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-14-a.darwish@linutronix.de --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 70d9d0a4de2a..fc72f09a61b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2032,7 +2032,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; - seqcount_init(&p->mems_allowed_seq); + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -- cgit v1.2.3 From 025e82bcbc34cd071390e72fd0b31593282f9425 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:23 +0200 Subject: timekeeping: Use sequence counter with associated raw spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_raw_spinlock_t data type, which allows to associate a raw spinlock with the sequence counter. This enables lockdep to verify that the raw spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-18-a.darwish@linutronix.de --- kernel/time/timekeeping.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..05ecfd8a3314 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. 
*/ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset -- cgit v1.2.3 From af5a06b582ec3d7b0160b4faaa65f73d8dcf989f Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:30 +0200 Subject: hrtimer: Use sequence counter with associated raw spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_raw_spinlock_t data type, which allows to associate a raw spinlock with the sequence counter. This enables lockdep to verify that the raw spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-25-a.darwish@linutronix.de --- kernel/time/hrtimer.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; -- cgit v1.2.3 From f227e3ec3b5cad859ad15666874405e8c1bbc1d4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 10 Jul 2020 15:23:19 +0200 Subject: random32: update the net random state on interrupt and activity This modifies the first 32 bits out of the 128 bits of a random CPU's net_rand_state on interrupt or CPU activity to complicate remote observations that could lead to guessing the network RNG's internal state. Note that depending on some network devices' interrupt rate moderation or binding, this re-seeding might happen on every packet or even almost never. In addition, with NOHZ some CPUs might not even get timer interrupts, leaving their local state rarely updated, while they are running networked processes making use of the random state. For this reason, we also perform this update in update_process_times() in order to at least update the state when there is user or system activity, since it's the only case we care about. Reported-by: Amit Klein Suggested-by: Linus Torvalds Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- kernel/time/timer.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df1ff803acc4..026ac01af9da 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1742,6 +1743,13 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); + + /* The current CPU might make use of net randoms without receiving IRQs + * to renew them often enough. Let's update the net_rand_state from a + * non-constant value that's not affine to the number of calls to make + * sure it's updated when there's some activity (we don't care in idle). 
+ */ + this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** -- cgit v1.2.3 From 1d4e1eab456e1ee92a94987499b211db05f900ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jul 2020 21:09:12 -0700 Subject: bpf: Fix map leak in HASH_OF_MAPS map Fix HASH_OF_MAPS bug of not putting inner map pointer on bpf_map_elem_update() operation. This is due to per-cpu extra_elems optimization, which bypassed free_htab_elem() logic doing proper clean ups. Make sure that inner map is put properly in optimized case as well. Fixes: 8c290e60fa2a ("bpf: fix hashmap extra_elems logic") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200729040913.2815687-1-andriin@fb.com --- kernel/bpf/hashtab.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4b288a3c3c9..b32cc8ce8ff6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -779,15 +779,20 @@ static void htab_elem_free_rcu(struct rcu_head *head) htab_elem_free(htab, l); } -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; + void *ptr; if (map->ops->map_fd_put_ptr) { - void *ptr = fd_htab_map_get_ptr(map, l); - + ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(ptr); } +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { __pcpu_freelist_push(&htab->freelist, &l->fnode); @@ -839,6 +844,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; + htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; -- cgit v1.2.3 From b13fecb1c3a603c4b8e99b306fecf4f668c11b32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Jul 2020 15:01:26 -0700 Subject: treewide: Replace DECLARE_TASKLET() with DECLARE_TASKLET_OLD() This converts all the existing DECLARE_TASKLET() (and ...DISABLED) macros with DECLARE_TASKLET_OLD() in preparation for refactoring the tasklet callback type. All existing DECLARE_TASKLET() users had a "0" data argument, it has been removed here as well. 
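The _OLD macro itself is presumably just the previous static initializer with the data argument pinned to zero, along the lines of the following sketch (the real definition is in the interrupt header):

#define DECLARE_TASKLET_OLD(name, _func)		\
struct tasklet_struct name = {				\
	.count = ATOMIC_INIT(0),			\
	.func  = _func,					\
}

with a DECLARE_TASKLET_DISABLED_OLD() counterpart using ATOMIC_INIT(1) so the tasklet starts out disabled.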
Reviewed-by: Greg Kroah-Hartman Acked-by: Thomas Gleixner Signed-off-by: Kees Cook --- kernel/backtracetest.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/irq/resend.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a2a97fa3071b..370217dd7e39 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data) complete(&backtrace_work); } -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); +static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); static void backtrace_test_irq(void) { diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9e5934780f41..b16dbc1bf056 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing) atomic_set(&kgdb_break_tasklet_var, 0); } -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); +static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); void kgdb_schedule_breakpoint(void) { diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 27634f4022d0..c48ce19a257f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { -- cgit v1.2.3 From 12cc923f1ccc1df467e046b02a72c2b3b321b6a2 Mon Sep 17 00:00:00 2001 From: Romain Perier Date: Sun, 29 Sep 2019 18:30:13 +0200 Subject: tasklet: Introduce new initialization API Nowadays, modern kernel subsystems that use callbacks pass the data structure associated with a given callback as argument to the callback. The tasklet subsystem remains one which passes an arbitrary unsigned long to the callback function. This has several problems: - This keeps an extra field for storing the argument in each tasklet data structure, it bloats the tasklet_struct structure with a redundant .data field - No type checking can be performed on this argument. Instead of using container_of() like other callback subsystems, it forces callbacks to do explicit type cast of the unsigned long argument into the required object type. - Buffer overflows can overwrite the .func and the .data field, so an attacker can easily overwrite the function and its first argument to whatever it wants. Add a new tasklet initialization API, via DECLARE_TASKLET() and tasklet_setup(), which will replace the existing ones. This work is greatly inspired by the timer_struct conversion series, see commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()") To avoid problems with both -Wcast-function-type (which is enabled in the kernel via -Wextra is several subsystems), and with mismatched function prototypes when build with Control Flow Integrity enabled, this adds the "use_callback" member to let the tasklet caller choose which union member to call through. Once all old API uses are removed, this and the .data member will be removed as well. (On 64-bit this does not grow the struct size as the new member fills the hole after atomic_t, which is also "int" sized.) 
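A conversion of a typical user then looks roughly like this (hypothetical foo driver, not taken from this series):

struct foo {
	struct tasklet_struct tasklet;
	int pending;
};

/* New-style callback: receives the tasklet itself and recovers its container. */
static void foo_tasklet_cb(struct tasklet_struct *t)
{
	struct foo *foo = container_of(t, struct foo, tasklet);

	foo->pending = 0;
}

static void foo_init(struct foo *foo)
{
	/* was: tasklet_init(&foo->tasklet, foo_func, (unsigned long)foo); */
	tasklet_setup(&foo->tasklet, foo_tasklet_cb);
}

Static declarations use the new DECLARE_TASKLET(name, foo_tasklet_cb) form the same way, with no data argument to carry around.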
Signed-off-by: Romain Perier Co-developed-by: Allen Pais Signed-off-by: Allen Pais Reviewed-by: Greg Kroah-Hartman Acked-by: Thomas Gleixner Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- kernel/softirq.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..292e7c2d2333 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -547,7 +547,10 @@ static void tasklet_action_common(struct softirq_action *a, if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); - t->func(t->data); + if (t->use_callback) + t->callback(t); + else + t->func(t->data); tasklet_unlock(t); continue; } @@ -573,6 +576,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } +void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->callback = callback; + t->use_callback = true; + t->data = 0; +} +EXPORT_SYMBOL(tasklet_setup); + void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -580,6 +595,7 @@ void tasklet_init(struct tasklet_struct *t, t->state = 0; atomic_set(&t->count, 0); t->func = func; + t->use_callback = false; t->data = data; } EXPORT_SYMBOL(tasklet_init); -- cgit v1.2.3 From 4fc00b79b85d4c34bef06ad49f109ad7cd9e5d83 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 28 Jul 2020 15:18:01 -0700 Subject: bpf: Add missing newline characters in verifier error messages Newline characters are added in two verifier error messages, refactored in Commit afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier"). This way, they do not mix with messages afterwards. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728221801.1090349-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88bb25d08bf8..b6ccfce3bf4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3069,7 +3069,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d", + "R%d invalid %s buffer access: off=%d, size=%d\n", regno, buf_info, off, size); return -EACCES; } @@ -3078,7 +3078,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", + "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } -- cgit v1.2.3 From a9d0ba6772a0056457838022b624c3d2a17cee21 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 30 Jul 2020 16:23:17 +0800 Subject: tracing/hwlat: Drop the duplicate assignment in start_kthread() We have set 'current_mask' to '&save_cpumask' in its declaration, so there is no need to assign again. 
Link: https://lkml.kernel.org/r/20200730082318.42584-1-haokexin@gmail.com Signed-off-by: Kevin Hao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index e2be7bb7ef7e..ddb528a6cd51 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -371,7 +371,6 @@ static int start_kthread(struct trace_array *tr) return 0; /* Just pick the first CPU on first iteration */ - current_mask = &save_cpumask; get_online_cpus(); cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); put_online_cpus(); -- cgit v1.2.3 From 96b4833b6827a62c295b149213c68b559514c929 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 30 Jul 2020 16:23:18 +0800 Subject: tracing/hwlat: Honor the tracing_cpumask In calculation of the cpu mask for the hwlat kernel thread, the wrong cpu mask is used instead of the tracing_cpumask, this causes the tracing/tracing_cpumask useless for hwlat tracer. Fixes it. Link: https://lkml.kernel.org/r/20200730082318.42584-2-haokexin@gmail.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 0330f7aa8ee6 ("tracing: Have hwlat trace migrate across tracing_cpumask CPUs") Signed-off-by: Kevin Hao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index ddb528a6cd51..17873e5d0353 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -283,6 +283,7 @@ static bool disable_migrate; static void move_to_next_cpu(void) { struct cpumask *current_mask = &save_cpumask; + struct trace_array *tr = hwlat_trace; int next_cpu; if (disable_migrate) @@ -296,7 +297,7 @@ static void move_to_next_cpu(void) goto disable; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(smp_processor_id(), current_mask); put_online_cpus(); @@ -372,7 +373,7 @@ static int start_kthread(struct trace_array *tr) /* Just pick the first CPU on first iteration */ get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); put_online_cpus(); next_cpu = cpumask_first(current_mask); -- cgit v1.2.3 From 8a224ffb3f52b0027f6b7279854c71a31c48fc97 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 29 Jul 2020 02:05:53 +0800 Subject: ftrace: Setup correct FTRACE_FL_REGS flags for module When module loaded and enabled, we will use __ftrace_replace_code for module if any ftrace_ops referenced it found. But we will get wrong ftrace_addr for module rec in ftrace_get_addr_new, because rec->flags has not been setup correctly. It can cause the callback function of a ftrace_ops has FTRACE_OPS_FL_SAVE_REGS to be called with pt_regs set to NULL. So setup correct FTRACE_FL_REGS flags for rec when we call referenced_filters to find ftrace_ops references it. 
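To make the failure mode concrete, consider a hypothetical ftrace_ops that sets FTRACE_OPS_FL_SAVE_REGS and unconditionally dereferences its pt_regs argument. If the module call site is patched to the non-regs ftrace_caller because rec->flags lacked FTRACE_FL_REGS, this callback runs with regs == NULL and oopses:

  #include <linux/ftrace.h>
  #include <linux/printk.h>

  /* Hypothetical callback that relies on the saved register state. */
  static void my_regs_callback(unsigned long ip, unsigned long parent_ip,
                               struct ftrace_ops *op, struct pt_regs *regs)
  {
          /* Crashes if the trampoline did not save registers (regs == NULL). */
          pr_info("traced %pS, pc=%lx\n", (void *)ip, instruction_pointer(regs));
  }

  static struct ftrace_ops my_regs_ops = {
          .func  = my_regs_callback,
          .flags = FTRACE_OPS_FL_SAVE_REGS,
  };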
Link: https://lkml.kernel.org/r/20200728180554.65203-1-zhouchengming@bytedance.com Cc: stable@vger.kernel.org Fixes: 8c4f3c3fa9681 ("ftrace: Check module functions being traced on reload") Signed-off-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c141d347f71a..d052f856f1cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6198,8 +6198,11 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; + if (ops_references_rec(ops, rec)) { + cnt++; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + } } return cnt; @@ -6378,8 +6381,8 @@ void ftrace_module_enable(struct module *mod) if (ftrace_start_up) cnt += referenced_filters(rec); - /* This clears FTRACE_FL_DISABLED */ - rec->flags = cnt; + rec->flags &= ~FTRACE_FL_DISABLED; + rec->flags += cnt; if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(rec, 1); -- cgit v1.2.3 From c5f51572a7fdff5be9e67c231de593155f394ab3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 29 Jul 2020 02:05:54 +0800 Subject: ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module and set trampolines When inserting a module, we find all ftrace_ops referencing it on the ftrace_ops_list. But FTRACE_OPS_FL_DIRECT and FTRACE_OPS_FL_IPMODIFY flags are special, and should not be set automatically. So warn and skip ftrace_ops that have these two flags set and adding new code. Also check if only one ftrace_ops references the module, in which case we can use a trampoline as an optimization. Link: https://lkml.kernel.org/r/20200728180554.65203-2-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d052f856f1cf..f433cb44300a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6199,9 +6199,17 @@ static int referenced_filters(struct dyn_ftrace *rec) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { if (ops_references_rec(ops, rec)) { + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + continue; cnt++; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) rec->flags |= FTRACE_FL_REGS; + if (cnt == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; } } -- cgit v1.2.3 From ee896ee8051a618ea4fe3d91c58d3e6ef9faf705 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 27 Jul 2020 11:28:40 +0200 Subject: tracing: Remove outdated comment in stack handling This comment describes the behaviour before commit 2a820bf74918 ("tracing: Use percpu stack trace buffer more intelligently"). Since that commit, interrupts and NMIs do use the per-cpu stacks so the comment is no longer correct. Remove it. (Note that the FTRACE_STACK_SIZE mentioned in the comment has never existed, it probably should have said FTRACE_STACK_ENTRIES.) 
Link: https://lkml.kernel.org/r/20200727092840.18659-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4aab712f9567..dbcacdd56b02 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2930,12 +2930,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, skip++; #endif - /* - * Since events can happen in NMIs there's no safe way to - * use the per cpu ftrace_stacks. We reserve it and if an interrupt - * or NMI comes in, it will just have to use the default - * FTRACE_STACK_SIZE. - */ preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; -- cgit v1.2.3 From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:15 +0200 Subject: lockdep: Refactor IRQ trace events fields into struct Refactor the IRQ trace events fields, used for printing information about the IRQ trace events, into a separate struct 'irqtrace_events'. This improves readability by separating the information only used in reporting, as well as enables (simplified) storing/restoring of irqtrace_events snapshots. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 16 +++++-------- kernel/locking/lockdep.c | 58 +++++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 70d9d0a4de2a..56a640799680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2035,17 +2035,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + p->softirq_context = 0; #endif p->pagefault_disabled = 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c9ea05edce25..7b5800374c40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at 
(%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; if (unlikely(!debug_locks)) return; @@ -3752,8 +3754,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ this_cpu_write(hardirqs_enabled, 1); - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3763,8 +3765,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks)) return; @@ -3784,12 +3784,14 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) return; if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ this_cpu_write(hardirqs_enabled, 0); - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3802,7 +3804,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3814,7 +3816,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3823,9 +3825,9 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the @@ -3833,7 +3835,7 @@ void lockdep_softirqs_on(unsigned long ip) * enabled too: */ if (lockdep_hardirqs_enabled()) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3842,8 +3844,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3853,13 +3853,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON 
-> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? -- cgit v1.2.3 From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:16 +0200 Subject: kcsan: Improve IRQ state trace reporting To improve the general usefulness of the IRQ state trace events with KCSAN enabled, save and restore the trace information when entering and exiting the KCSAN runtime as well as when generating a KCSAN report. Without this, reporting the IRQ trace events (whether via a KCSAN report or outside of KCSAN via a lockdep report) is rather useless due to continuously being touched by KCSAN. This is because if KCSAN is enabled, every instrumented memory access causes changes to IRQ trace events (either by KCSAN disabling/enabling interrupts or taking report_lock when generating a report). Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided touching the IRQ trace events via raw_local_irq_save/restore() and lockdep_off/on(). Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking") Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 23 +++++++++++++++++++++++ kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 3 +++ 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 732623c30359..0fe068192781 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -291,6 +291,20 @@ static inline unsigned int get_delay(void) 0); } +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { + kcsan_save_irqtrace(current); kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint - watchpoints); + kcsan_restore_irqtrace(current); } else { /* * The other thread may not print any diagnostics, as it has @@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. 
+ */ + kcsan_save_irqtrace(current); if (!kcsan_interrupt_watcher) local_irq_save(irq_flags); @@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 763d6d08d94b..29480010dc30 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -9,6 +9,7 @@ #define _KERNEL_KCSAN_KCSAN_H #include +#include /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 @@ -22,6 +23,12 @@ extern unsigned int kcsan_udelay_interrupt; */ extern bool kcsan_enabled; +/* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + /* * Initialize debugfs file. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 6b2fb1a6d8cd..9d07e175de0f 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task) if (!task) return; + /* Restore IRQ state trace for printing. */ + kcsan_restore_irqtrace(task); + pr_err("\n"); debug_show_held_locks(task); print_irqtrace_events(task); -- cgit v1.2.3 From f4470cdf108f00533e8079b19434e6cb48c17fa3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 31 Jul 2020 20:20:14 +0100 Subject: sched: Document arch_scale_*_capacity() Rather that hide their purpose in some dark, damp corner of Documentation/, add some documentation to the default implementations. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200731192016.7484-2-valentin.schneider@arm.com --- kernel/sched/sched.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 296efd30d8c9..3fd283892761 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2049,6 +2049,16 @@ void arch_scale_freq_tick(void) #endif #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { -- cgit v1.2.3 From 7ef5264de773279b9f23b6cc8afb5addb30e970b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:20 +0200 Subject: modules: mark ref_module static ref_module isn't used anywhere outside of module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..baae0e83d630 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -869,7 +869,7 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { int err; @@ -888,7 +888,6 @@ int ref_module(struct module *a, struct module *b) } return 0; } -EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. 
*/ static void module_unload_free(struct module *mod) @@ -1169,11 +1168,10 @@ static inline void module_unload_free(struct module *mod) { } -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(ref_module); static inline int module_unload_init(struct module *mod) { -- cgit v1.2.3 From 773110470e2fa3839523384ae014f8a723c4d178 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:21 +0200 Subject: modules: mark find_symbol static find_symbol is only used in module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index baae0e83d630..0f95fb4b3e37 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -585,7 +585,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -const struct kernel_symbol *find_symbol(const char *name, +static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, bool gplok, @@ -608,7 +608,6 @@ const struct kernel_symbol *find_symbol(const char *name, pr_debug("Failed to find symbol %s\n", name); return NULL; } -EXPORT_SYMBOL_GPL(find_symbol); /* * Search for module by name: must hold module_mutex (or preempt disabled -- cgit v1.2.3 From a54e04914c211b5678602a46b3ede5d82ec1327d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:22 +0200 Subject: modules: mark each_symbol_section static each_symbol_section is only used inside of module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0f95fb4b3e37..c2a099a27b68 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. */ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, +static bool each_symbol_section(bool (*fn)(const struct symsearch *arr, struct module *owner, void *data), void *data) @@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, } return false; } -EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ -- cgit v1.2.3 From 3fe1e56d0e68b623dd62d8d38265d2a052e7e185 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:23 +0200 Subject: modules: unexport __module_text_address __module_text_address is only used by built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c2a099a27b68..6ee1739e3150 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4503,7 +4503,6 @@ struct module *__module_text_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. 
*/ void print_modules(void) -- cgit v1.2.3 From 34e64705ad415ed7a816e60ef62b42fe6d1729d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:24 +0200 Subject: modules: unexport __module_address __module_address is only used by built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6ee1739e3150..e85d06158fbc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4464,7 +4464,6 @@ struct module *__module_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? -- cgit v1.2.3 From cd8732cdcc37d7077c4fa2c966b748c0662b607e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:25 +0200 Subject: modules: rename the licence field in struct symsearch to license Use the same spelling variant as the rest of the file. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e85d06158fbc..62d817a0dca8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -504,9 +504,9 @@ static bool check_exported_symbol(const struct symsearch *syms, struct find_symbol_arg *fsa = data; if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) + if (syms->license == GPL_ONLY) return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) { pr_warn("Symbol %s is being used by a non-GPL module, " "which will not be allowed in the future\n", fsa->name); -- cgit v1.2.3 From ef1dac6021cc8ec5de02ce31722bf26ac4ed5523 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:26 +0200 Subject: modules: return licensing information from find_symbol Report the GPLONLY status through a new argument. 
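For context, a minimal sketch of a caller consuming the new out-argument (this mirrors what the inherit_taint patch later in this series does with it; the surrounding variables are as in resolve_symbol(), and the gplok argument is simplified to true here):

  struct module *owner;
  const s32 *crc;
  enum mod_license license;
  const struct kernel_symbol *sym;

  sym = find_symbol(name, &owner, &crc, &license, true, true);
  if (sym && license == GPL_ONLY)
          mod->using_gplonly_symbols = true;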
Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 62d817a0dca8..656f5ff27088 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -495,6 +495,7 @@ struct find_symbol_arg { struct module *owner; const s32 *crc; const struct kernel_symbol *sym; + enum mod_license license; }; static bool check_exported_symbol(const struct symsearch *syms, @@ -528,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms, fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; + fsa->license = syms->license; return true; } @@ -587,6 +589,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, + enum mod_license *license, bool gplok, bool warn) { @@ -601,6 +604,8 @@ static const struct kernel_symbol *find_symbol(const char *name, *owner = fsa.owner; if (crc) *crc = fsa.crc; + if (license) + *license = fsa.license; return fsa.sym; } @@ -1074,7 +1079,7 @@ void __symbol_put(const char *symbol) struct module *owner; preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, true, false)) + if (!find_symbol(symbol, &owner, NULL, NULL, true, false)) BUG(); module_put(owner); preempt_enable(); @@ -1352,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol("module_layout", NULL, &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) { preempt_enable(); BUG(); } @@ -1436,6 +1441,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, struct module *owner; const struct kernel_symbol *sym; const s32 *crc; + enum mod_license license; int err; /* @@ -1445,7 +1451,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, */ sched_annotate_sleep(); mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, + sym = find_symbol(name, &owner, &crc, &license, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!sym) goto unlock; @@ -2213,7 +2219,7 @@ void *__symbol_get(const char *symbol) const struct kernel_symbol *sym; preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, true, true); + sym = find_symbol(symbol, &owner, NULL, NULL, true, true); if (sym && strong_try_module_get(owner)) sym = NULL; preempt_enable(); @@ -2249,7 +2255,7 @@ static int verify_exported_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(kernel_symbol_name(s), &owner, NULL, - true, false)) { + NULL, true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, kernel_symbol_name(s), -- cgit v1.2.3 From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach bpf_link without destroying it. It has the same behavior as auto-detaching of bpf_link due to cgroup dying for bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case, bpf_link is still a valid kernel object, but is defuncts and doesn't hold BPF program attached to corresponding BPF hook. 
This functionality allows users with enough access rights to manually force-detach attached bpf_link without killing respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return 
PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From c6fe44d96fc1536af5b11cd859686453d1b7bfd1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jul 2020 12:33:41 -0700 Subject: list: add "list_del_init_careful()" to go with "list_empty_careful()" That gives us ordering guarantees around the pair. Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..01f5d3020589 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } -- cgit v1.2.3 From 0f69dae4d1069963fdcc16e63655927d83281006 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Fri, 31 Jul 2020 08:27:45 +0800 Subject: trace : Have tracing buffer info use kvzalloc instead of kzalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit High order memory stuff within trace could introduce OOM, use kvzalloc instead. Please find the bellowing for the call stack we run across in an android system. The scenario happens when traced_probes is woken up to get a large quantity of trace even if free memory is even higher than watermark_low.  
traced_probes invoked oom-killer: gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null), order=2, oom_score_adj=-1 traced_probes cpuset=system-background mems_allowed=0 CPU: 3 PID: 588 Comm: traced_probes Tainted: G W O 4.14.181 #1 Hardware name: Generic DT based system (unwind_backtrace) from [] (show_stack+0x20/0x24) (show_stack) from [] (dump_stack+0xa8/0xec) (dump_stack) from [] (dump_header+0x9c/0x220) (dump_header) from [] (oom_kill_process+0xc0/0x5c4) (oom_kill_process) from [] (out_of_memory+0x220/0x310) (out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4) (__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48) (kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118) (kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc) (tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c) (do_dentry_open) from [] (vfs_open+0x50/0x70) (vfs_open) from [] (path_openat+0x5fc/0x169c) (path_openat) from [] (do_filp_open+0x94/0xf8) (do_filp_open) from [] (do_sys_open+0x168/0x26c) (do_sys_open) from [] (SyS_openat+0x34/0x38) (SyS_openat) from [] (ret_fast_syscall+0x0/0x28) Link: https://lkml.kernel.org/r/1596155265-32365-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbcacdd56b02..06c0feae5ff9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7402,7 +7402,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (ret) return ret; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; @@ -7528,7 +7528,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); - kfree(info); + kvfree(info); mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From c58b6b0372de0d4cd0536d6585addd1b36b151ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jul 2020 20:50:48 -0400 Subject: ftrace: Fix ftrace_trace_task return value I was attempting to use pid filtering with function_graph, but it wasn't allowing anything to make it through. Turns out ftrace_trace_task returns false if ftrace_ignore_pid is not-empty, which isn't correct anymore. We're now setting it to FTRACE_PID_IGNORE if we need to ignore that pid, otherwise it's set to the pid (which is weird considering the name) or to FTRACE_PID_TRACE. Fix the check to check for != FTRACE_PID_IGNORE. With this we can now use function_graph with pid filtering. 
Link: https://lkml.kernel.org/r/20200725005048.1790-1-josef@toxicpanda.com Fixes: 717e3f5ebc82 ("ftrace: Make function trace pid filtering a bit more exact") Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.h | 7 ++++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f433cb44300a..4e3a5d79c078 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -#define FTRACE_PID_IGNORE -1 -#define FTRACE_PID_TRACE -2 - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f21607f87189..610d21355526 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + struct ftrace_func_command { struct list_head list; char *name; @@ -1114,7 +1118,8 @@ struct ftrace_func_command { extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { - return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != + FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, -- cgit v1.2.3 From 0cb2f1372baa60af8456388a574af6133edd7d80 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jul 2020 14:45:36 +0800 Subject: kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler We found a case of kernel panic on our server. The stack trace is as follows(omit some irrelevant information): BUG: kernel NULL pointer dereference, address: 0000000000000080 RIP: 0010:kprobe_ftrace_handler+0x5e/0xe0 RSP: 0018:ffffb512c6550998 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8e9d16eea018 RCX: 0000000000000000 RDX: ffffffffbe1179c0 RSI: ffffffffc0535564 RDI: ffffffffc0534ec0 RBP: ffffffffc0534ec1 R08: ffff8e9d1bbb0f00 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8e9d1f797060 R14: 000000000000bacc R15: ffff8e9ce13eca00 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000080 CR3: 00000008453d0005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ftrace_ops_assist_func+0x56/0xe0 ftrace_call+0x5/0x34 tcpa_statistic_send+0x5/0x130 [ttcp_engine] The tcpa_statistic_send is the function being kprobed. After analysis, the root cause is that the fourth parameter regs of kprobe_ftrace_handler is NULL. Why regs is NULL? We use the crash tool to analyze the kdump. crash> dis tcpa_statistic_send -r : callq 0xffffffffbd8018c0 The tcpa_statistic_send calls ftrace_caller instead of ftrace_regs_caller. So it is reasonable that the fourth parameter regs of kprobe_ftrace_handler is NULL. In theory, we should call the ftrace_regs_caller instead of the ftrace_caller. After in-depth analysis, we found a reproducible path. Writing a simple kernel module which starts a periodic timer. The timer's handler is named 'kprobe_test_timer_handler'. The module name is kprobe_test.ko. 
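For reference, a minimal sketch of such a test module (names taken from the description above; the one-second period and the exact body are assumptions) could look like:

  #include <linux/module.h>
  #include <linux/timer.h>
  #include <linux/jiffies.h>

  static struct timer_list kprobe_test_timer;

  /* Periodic handler; this is the function the kretprobe is placed on. */
  static void kprobe_test_timer_handler(struct timer_list *t)
  {
          mod_timer(&kprobe_test_timer, jiffies + HZ);    /* re-arm every second */
  }

  static int __init kprobe_test_init(void)
  {
          timer_setup(&kprobe_test_timer, kprobe_test_timer_handler, 0);
          mod_timer(&kprobe_test_timer, jiffies + HZ);
          return 0;
  }

  static void __exit kprobe_test_exit(void)
  {
          del_timer_sync(&kprobe_test_timer);
  }

  module_init(kprobe_test_init);
  module_exit(kprobe_test_exit);
  MODULE_LICENSE("GPL");

With that module in place, the reproduction steps are: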
1) insmod kprobe_test.ko 2) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}' 3) echo 0 > /proc/sys/kernel/ftrace_enabled 4) rmmod kprobe_test 5) stop step 2) kprobe 6) insmod kprobe_test.ko 7) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}' We mark the kprobe as GONE but not disarm the kprobe in the step 4). The step 5) also do not disarm the kprobe when unregister kprobe. So we do not remove the ip from the filter. In this case, when the module loads again in the step 6), we will replace the code to ftrace_caller via the ftrace_module_enable(). When we register kprobe again, we will not replace ftrace_caller to ftrace_regs_caller because the ftrace is disabled in the step 3). So the step 7) will trigger kernel panic. Fix this problem by disarming the kprobe when the module is going away. Link: https://lkml.kernel.org/r/20200728064536.24405-1-songmuchun@bytedance.com Cc: stable@vger.kernel.org Fixes: ae6aa16fdc16 ("kprobes: introduce ftrace based optimization") Acked-by: Masami Hiramatsu Signed-off-by: Muchun Song Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..07bf03fcf574 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2113,6 +2113,13 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); + + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace. + */ + if (kprobe_ftrace(p)) + disarm_kprobe_ftrace(p); } /* Disable one kprobe */ -- cgit v1.2.3 From 231621d0c570765d5cebd95c582f1c8df5c46028 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 24 Jul 2020 11:24:24 +0800 Subject: tracing/uprobe: Remove dead code in trace_uprobe_register() In the function trace_uprobe_register(), the statement "return 0;" out of switch case is dead code, remove it. Link: https://lkml.kernel.org/r/1595561064-29186-1-git-send-email-fanpeng@loongson.cn Signed-off-by: Peng Fan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fdd47f99b18f..f4286c9bdeb4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, default: return 0; } - return 0; } static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) -- cgit v1.2.3 From afcab636657421f7ebfa0783a91f90256bba0091 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Aug 2020 20:00:02 -0400 Subject: tracing: Use trace_sched_process_free() instead of exit() for pid tracing On exit, if a process is preempted after the trace_sched_process_exit() tracepoint but before the process is done exiting, then when it gets scheduled in, the function tracers will not filter it properly against the function tracing pid filters. That is because the function tracing pid filters hooks to the sched_process_exit() tracepoint to remove the exiting task's pid from the filter list. Because the filtering happens at the sched_switch tracepoint, when the exiting task schedules back in to finish up the exit, it will no longer be in the function pid filtering tables. 
This was noticeable in the notrace self tests on a preemptable kernel, as the tests would fail as it exits and preempted after being taken off the notrace filter table and on scheduling back in it would not be in the notrace list, and then the ending of the exit function would trace. The test detected this and would fail. Cc: stable@vger.kernel.org Cc: Namhyung Kim Fixes: 1e10486ffee0a ("ftrace: Add 'function-fork' trace option") Fixes: c37775d57830a ("tracing: Add infrastructure to allow set_event_pid to follow children" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_events.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4e3a5d79c078..76f2dd6fd414 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6985,12 +6985,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } else { unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f6f55682d3e2..a85effb2373b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); - register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); - unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } -- cgit v1.2.3 From 262e6ae7081df304fc625cf368d5c2cbba2bb991 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jul 2020 23:33:33 +0200 Subject: modules: inherit TAINT_PROPRIETARY_MODULE If a TAINT_PROPRIETARY_MODULE exports symbol, inherit the taint flag for all modules importing these symbols, and don't allow loading symbols from TAINT_PROPRIETARY_MODULE modules if the module previously imported gplonly symbols. Add a anti-circumvention devices so people don't accidentally get themselves into trouble this way. 
Comment from Greg: "Ah, the proven-to-be-illegal "GPL Condom" defense :)" [jeyu: pr_info -> pr_err and pr_warn as per discussion] Link: http://lore.kernel.org/r/20200730162957.GA22469@lst.de Acked-by: Daniel Vetter Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 656f5ff27088..09bf5a652a47 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1431,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } +static bool inherit_taint(struct module *mod, struct module *owner) +{ + if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) + return true; + + if (mod->using_gplonly_symbols) { + pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", + mod->name, owner->name); + return false; + } + + if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { + pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", + mod->name, owner->name); + set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); + } + return true; +} /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, @@ -1456,6 +1474,14 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; + if (license == GPL_ONLY) + mod->using_gplonly_symbols = true; + + if (!inherit_taint(mod, owner)) { + sym = NULL; + goto getname; + } + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; -- cgit v1.2.3 From a1bd06853ee478d37fae9435c5521e301de94c67 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Wed, 5 Aug 2020 16:31:38 -0400 Subject: sched: Fix use of count for nr_running tracepoint The count field is meant to tell if an update to nr_running is an add or a subtract. Make it do so by adding the missing minus sign. Fixes: 9d246053a691 ("sched: Add a tracepoint to track rq->nr_running") Signed-off-by: Phil Auld Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200805203138.1411-1-pauld@redhat.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3fd283892761..28709f6b0975 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; if (trace_sched_update_nr_running_tp_enabled()) { - call_trace_sched_update_nr_running(rq, count); + call_trace_sched_update_nr_running(rq, -count); } /* Check if we still need preemption */ -- cgit v1.2.3 From 19d0070a2792181f79df01277fe00b83b9f7eda7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Aug 2020 17:01:23 +0200 Subject: timekeeping/vsyscall: Provide vdso_update_begin/end() Architectures can have the requirement to add additional architecture specific data to the VDSO data page which needs to be updated independent of the timekeeper updates. To protect these updates vs. concurrent readers and a conflicting update through timekeeping, provide helper functions to make such updates safe. vdso_update_begin() takes the timekeeper_lock to protect against a potential update from timekeeper code and increments the VDSO sequence count to signal data inconsistency to concurrent readers. 
vdso_update_end() makes the sequence count even again to signal data consistency and drops the timekeeper lock. [ Sven: Add interrupt disable handling to the functions ] Signed-off-by: Thomas Gleixner Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200804150124.41692-3-svens@linux.ibm.com --- kernel/time/timekeeping.c | 2 +- kernel/time/timekeeping_internal.h | 11 +++++++--- kernel/time/vsyscall.c | 41 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c7212f3c603 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -50,7 +50,7 @@ static struct { .seq = SEQCNT_ZERO(tk_core.seq), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); +DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include +#include #include +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include #include +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. 
+ */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} -- cgit v1.2.3 From 45fd22da97c6125d8d0d35bd1791e7c0c4175279 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 5 Aug 2020 10:56:56 +0300 Subject: perf/core: Take over CAP_SYS_PTRACE creds to CAP_PERFMON capability Open access to per-process monitoring for CAP_PERFMON only privileged processes [1]. Extend ptrace_may_access() check in perf_events subsystem with perfmon_capable() to simplify user experience and make monitoring more secure by reducing attack surface. [1] https://lore.kernel.org/lkml/7776fa40-6c65-2aa6-1322-eb3a01201000@linux.intel.com/ Signed-off-by: Alexey Budankov Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/6e8392ff-4732-0012-2949-e1587709f0f6@linux.intel.com --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 78e69e10482a..41e0cefb429b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11689,7 +11689,7 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; /* - * Reuse ptrace permission checks for now. + * Preserve ptrace permission check for backwards compatibility. * * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to @@ -11697,7 +11697,7 @@ SYSCALL_DEFINE5(perf_event_open, * perf_event_exit_task() that could imply). */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto err_cred; } -- cgit v1.2.3 From 10de795a5addd1962406796a6e13ba6cc0fc6bee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 6 Aug 2020 01:20:46 +0800 Subject: kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE Fix compiler warning(as show below) for !CONFIG_KPROBES_ON_FTRACE. kernel/kprobes.c: In function 'kill_kprobe': kernel/kprobes.c:1116:33: warning: statement with no effect [-Wunused-value] 1116 | #define disarm_kprobe_ftrace(p) (-ENODEV) | ^ kernel/kprobes.c:2154:3: note: in expansion of macro 'disarm_kprobe_ftrace' 2154 | disarm_kprobe_ftrace(p); Link: https://lore.kernel.org/r/20200805142136.0331f7ea@canb.auug.org.au Link: https://lkml.kernel.org/r/20200805172046.19066-1-songmuchun@bytedance.com Reported-by: Stephen Rothwell Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Acked-by: Masami Hiramatsu Acked-by: John Fastabend Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 07bf03fcf574..66d14107968d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1079,9 +1079,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? 
&kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -#define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) (-ENODEV) +static inline int prepare_kprobe(struct kprobe *p) +{ + return arch_prepare_kprobe(p); +} + +static inline int arm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} + +static inline int disarm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} #endif /* Arm a kprobe with text_mutex */ -- cgit v1.2.3 From 820903c784a01bf6e143253418508da4f5790cff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 12:14:05 +0200 Subject: posix-cpu-timers: Split run_posix_cpu_timers() Split it up as a preparatory step to move the heavy lifting out of interrupt context. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200730102337.677439437@linutronix.de --- kernel/time/posix-cpu-timers.c | 43 +++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..e5ad87320468 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1080,32 +1080,15 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } -/* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. - */ -void run_posix_cpu_timers(void) +static void __run_posix_cpu_timers(struct task_struct *tsk) { - struct task_struct *tsk = current; struct k_itimer *timer, *next; unsigned long flags; LIST_HEAD(firing); - lockdep_assert_irqs_disabled(); - - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) + if (!lock_task_sighand(tsk, &flags)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; - } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1147,6 +1130,28 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + lockdep_posixtimer_enter(); + __run_posix_cpu_timers(tsk); lockdep_posixtimer_exit(); } -- cgit v1.2.3 From 1fb497dd003009be95ce67689ac800c446b7acc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 12:14:06 +0200 Subject: posix-cpu-timers: Provide mechanisms to defer timer handling to task_work Running posix CPU timers in hard interrupt context has a few downsides: - For PREEMPT_RT it cannot work as the expiry code needs to take sighand lock, which is a 'sleeping spinlock' in RT. The original RT approach of offloading the posix CPU timer handling into a high priority thread was clumsy and provided no real benefit in general. 
- For fine grained accounting it's just wrong to run this in context of the timer interrupt because that way a process specific CPU time is accounted to the timer interrupt. - Long running timer interrupts caused by a large amount of expiring timers which can be created and armed by unpriviledged user space. There is no hard requirement to expire them in interrupt context. If the signal is targeted at the task itself then it won't be delivered before the task returns to user space anyway. If the signal is targeted at a supervisor process then it might be slightly delayed, but posix CPU timers are inaccurate anyway due to the fact that they are tied to the tick. Provide infrastructure to schedule task work which allows splitting the posix CPU timer code into a quick check in interrupt context and a thread context expiry and signal delivery function. This has to be enabled by architectures as it requires that the architecture specific KVM implementation handles pending task work before exiting to guest mode. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200730102337.783470146@linutronix.de --- kernel/time/Kconfig | 9 ++ kernel/time/posix-cpu-timers.c | 185 ++++++++++++++++++++++++++++++++++++++--- kernel/time/timer.c | 1 + 3 files changed, 183 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e5ad87320468..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,26 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } -static void __run_posix_cpu_timers(struct task_struct *tsk) +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + +/* + * Initialize posix CPU timers task work in init task. 
Out of line to + * keep the callback static and to avoid header recursion hell. + */ +void __init posix_cputimers_init_work(void) +{ + init_task_work(¤t->posix_cputimers_work.work, + posix_cpu_timers_work); +} + +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} + +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) + return; + + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; + } + + /* + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. + */ + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) { struct k_itimer *timer, *next; - unsigned long flags; + unsigned long flags, start; LIST_HEAD(firing); if (!lock_task_sighand(tsk, &flags)) return; - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. 
+ */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1117,6 +1266,13 @@ static void __run_posix_cpu_timers(struct task_struct *tsk) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1143,6 +1299,13 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + /* * The fast path checks that there are no expired thread or thread * group timers. If that's so, just return. @@ -1150,9 +1313,7 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - lockdep_posixtimer_enter(); __run_posix_cpu_timers(tsk); - lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ae5029f984a8..a16764b0116e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3 From a2e9a95d2190ef55bf0724ecdf8a466d393a86b6 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 4 Aug 2020 12:49:32 +0800 Subject: kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges The crash_exclude_mem_range() function can only handle one memory region a time. 
It will fail in the case in which the passed in area covers several memory regions. In this case, it will only exclude the first region, then return, but leave the later regions unsolved. E.g in a NEC system with two usable RAM regions inside the low 1M: ... BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable It will only exclude the memory region [0, 0x3efff], the memory region [0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause the following failure when dumping vmcore: ioremap on RAM at 0x0000000000040000 - 0x0000000000040fff WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0 ... RIP: 0010:__ioremap_caller+0x2c7/0x2e0 ... cp: error reading '/proc/vmcore': Cannot allocate memory kdump: saving vmcore failed In order to fix this bug, let's extend the crash_exclude_mem_range() to handle the overlapping ranges. [ mingo: Amended the changelog. ] Signed-off-by: Lianbo Jiang Signed-off-by: Ingo Molnar Acked-by: Dave Young Link: https://lore.kernel.org/r/20200804044933.1973-3-lijiang@redhat.com --- kernel/kexec_file.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 94661d2d13ad..97fa68267197 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { int i, j; - unsigned long long start, end; + unsigned long long start, end, p_start, p_end; struct crash_mem_range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; end = mem->ranges[i].end; + p_start = mstart; + p_end = mend; if (mstart > end || mend < start) continue; /* Truncate any area outside of range */ if (mstart < start) - mstart = start; + p_start = start; if (mend > end) - mend = end; + p_end = end; /* Found completely overlapping range */ - if (mstart == start && mend == end) { + if (p_start == start && p_end == end) { mem->ranges[i].start = 0; mem->ranges[i].end = 0; if (i < mem->nr_ranges - 1) { @@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem, mem->ranges[j].end = mem->ranges[j+1].end; } + + /* + * Continue to check if there are another overlapping ranges + * from the current position because of shifting the above + * mem ranges. + */ + i--; + mem->nr_ranges--; + continue; } mem->nr_ranges--; return 0; } - if (mstart > start && mend < end) { + if (p_start > start && p_end < end) { /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; + mem->ranges[i].end = p_start - 1; + temp_range.start = p_end + 1; temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; + } else if (p_start != start) + mem->ranges[i].end = p_start - 1; else - mem->ranges[i].start = mend + 1; + mem->ranges[i].start = p_end + 1; break; } @@ -1243,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two elf headers. 
One will contain kernel * text virtual addresses and other will have __va(physical) addresses. */ @@ -1270,7 +1281,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* Prepare one phdr of type PT_NOTE for each present cpu */ + /* Prepare one phdr of type PT_NOTE for each present CPU */ for_each_present_cpu(cpu) { phdr->p_type = PT_NOTE; notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); -- cgit v1.2.3 From 475f63ae63b5102ae6423d1712333929d04d6ecc Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 4 Aug 2020 12:49:33 +0800 Subject: kexec_file: Correctly output debugging information for the PT_LOAD ELF header Currently, when we enable the debugging switch to debug kexec_file, we always get the following incorrect results: kexec_file: Crash PT_LOAD elf header. phdr=00000000c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000003cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000001f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000080b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000024e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000002754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000076fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0 The reason is that kernel always prints the values of the next PT_LOAD instead of the current PT_LOAD. Change it to ensure that we can get the correct debugging information. [ mingo: Amended changelog, capitalized "ELF". 
] Signed-off-by: Lianbo Jiang Signed-off-by: Ingo Molnar Acked-by: Dave Young Link: https://lore.kernel.org/r/20200804044933.1973-4-lijiang@redhat.com --- kernel/kexec_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 97fa68267197..3f7867c1820f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1246,7 +1246,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, unsigned long long notes_addr; unsigned long mstart, mend; - /* extra phdr for vmcoreinfo elf note */ + /* extra phdr for vmcoreinfo ELF note */ nr_phdr = nr_cpus + 1; nr_phdr += mem->nr_ranges; @@ -1254,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two ELF headers. One will contain kernel * text virtual addresses and other will have __va(physical) addresses. */ @@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - phdr++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, ehdr->e_phnum, phdr->p_offset); + phdr++; } *addr = buf; -- cgit v1.2.3 From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 4 Aug 2020 22:50:56 -0700 Subject: bpf: Change uapi for bpf iterator map elements Commit a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") added bpf iterator support for map elements. The map element bpf iterator requires info to identify a particular map. In the above commit, the attr->link_create.target_fd is used to carry map_fd and an enum bpf_iter_link_info is added to uapi to specify the target_fd actually representing a map_fd: enum bpf_iter_link_info { BPF_ITER_LINK_UNSPEC = 0, BPF_ITER_LINK_MAP_FD = 1, MAX_BPF_ITER_LINK_INFO, }; This is an extensible approach as we can grow enumerator for pid, cgroup_id, etc. and we can unionize target_fd for pid, cgroup_id, etc. But in the future, there are chances that more complex customization may happen, e.g., for tasks, it could be filtered based on both cgroup_id and user_id. This patch changed the uapi to have fields __aligned_u64 iter_info; __u32 iter_info_len; for additional iter_info for link_create. The iter_info is defined as union bpf_iter_link_info { struct { __u32 map_fd; } map; }; So future extension for additional customization will be easier. The bpf_iter_link_info will be passed to target callback to validate and generic bpf_iter framework does not need to deal it any more. Note that map_fd = 0 will be considered invalid and -EBADF will be returned to user space. 
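For illustration only, a minimal sketch of how user space could create a map-element iterator link with the new fields, using the raw bpf(2) syscall (prog_fd and map_fd are assumed to be valid file descriptors obtained elsewhere, and the uapi headers must already carry this change):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_map_elem_iter_link(int prog_fd, int map_fd)
	{
		union bpf_iter_link_info linfo;
		union bpf_attr attr;

		memset(&linfo, 0, sizeof(linfo));
		linfo.map.map_fd = map_fd;	/* map_fd == 0 now yields -EBADF */

		memset(&attr, 0, sizeof(attr));
		attr.link_create.prog_fd = prog_fd;
		attr.link_create.attach_type = BPF_TRACE_ITER;
		/* target_fd and flags must be zero with this uapi */
		attr.link_create.iter_info = (__u64)(unsigned long)&linfo;
		attr.link_create.iter_info_len = sizeof(linfo);

		return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
	}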
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 58 +++++++++++++++++++++++++-------------------------- kernel/bpf/map_iter.c | 37 +++++++++++++++++++++++++------- kernel/bpf/syscall.c | 2 +- 3 files changed, 59 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 363b9cafc2d8..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link) struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); - if (iter_link->aux.map) - bpf_map_put_with_uref(iter_link->aux.map); + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; - struct bpf_iter_aux_info aux = {}; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; - u32 prog_btf_id, target_fd; + u32 prog_btf_id, linfo_len; bool existed = false; - struct bpf_map *map; int err; + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; - /* Make sure user supplied flags are target expected. 
*/ - target_fd = attr->link_create.target_fd; - if (attr->link_create.flags != tinfo->reg_info->req_linfo) - return -EINVAL; - if (!attr->link_create.flags && target_fd) - return -EINVAL; - link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } - if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { - map = bpf_map_get_with_uref(target_fd); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto cleanup_link; - } - - aux.map = map; - err = tinfo->reg_info->check_target(prog, &aux); + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { - bpf_map_put_with_uref(map); - goto cleanup_link; + bpf_link_cleanup(&link_primer); + return err; } - - link->aux.map = map; } return bpf_link_settle(&link_primer); - -cleanup_link: - bpf_link_cleanup(&link_primer); - return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index fbe1f557cb88..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { u32 key_acc_size, value_acc_size, key_size, value_size; - struct bpf_map *map = aux->map; + struct bpf_map *map; bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) - return -EINVAL; + goto put_map; key_acc_size = prog->aux->max_rdonly_access; value_acc_size = prog->aux->max_rdwr_access; @@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else value_size = round_up(map->value_size, 8) * num_possible_cpus(); - if (key_acc_size > key_size || value_acc_size > value_size) - return -EACCES; + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, @@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f343ce15747..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags 
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; -- cgit v1.2.3 From 0d360d64b01231cdb36e1936de889f308fd9317f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 6 Aug 2020 11:26:12 -0700 Subject: bpf: Remove inline from bpf_do_trace_printk I get the following error during compilation on my side: kernel/trace/bpf_trace.c: In function 'bpf_do_trace_printk': kernel/trace/bpf_trace.c:386:34: error: function 'bpf_do_trace_printk' can never be inlined because it uses variable argument lists static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) ^ Fixes: ac5a72ea5c89 ("bpf: Use dedicated bpf_trace_printk event instead of trace_printk()") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200806182612.1390883-1-sdf@google.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cb91ef902cc4..a8d4f253ed77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock); #define BPF_TRACE_PRINTK_SIZE 1024 -static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) { static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; -- cgit v1.2.3 From b8c1a3090741f349322ad855d2b66d6e9752a60d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:31:41 -0700 Subject: bpf: Delete repeated words in comments Drop repeated words in kernel/bpf/: {has, the} Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200807033141.10437-1-rdunlap@infradead.org --- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bde93344164d..ed0b3578867c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating - * the the position of the program to replace. + * the position of the program to replace. * * Return: * * 0 - Success diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6ccfce3bf4c..ef938f17b944 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> + * this stack slot, but current has STACK_MISC -> * this verifier states are not equivalent, * return false to continue verification of this path */ -- cgit v1.2.3 From 11990a5bd7e558e9203c1070fc52fb6f0488e75b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Aug 2020 14:15:23 -0700 Subject: module: Correctly truncate sysfs sections output The only-root-readable /sys/module/$module/sections/$section files did not truncate their output to the available buffer size. While most paths into the kernfs read handlers end up using PAGE_SIZE buffers, it's possible to get there through other paths (e.g. splice, sendfile). 
Actually limit the output to the "count" passed into the read function, and report it back correctly. *sigh* Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/20200805002015.GE23458@shao2-debian Fixes: ed66f991bb19 ("module: Refactor section attr into bin attribute") Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Acked-by: Jessica Yu Signed-off-by: Kees Cook --- kernel/module.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..08c46084d8cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1520,18 +1520,34 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) static ssize_t module_sect_read(struct file *file, struct kobject *kobj, struct bin_attribute *battr, char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; if (pos != 0) return -EINVAL; - return sprintf(buf, "0x%px\n", - kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1580,7 +1596,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) goto out; sect_attrs->nsections++; sattr->battr.read = module_sect_read; - sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.size = MODULE_SECT_READ_SIZE; sattr->battr.attr.mode = 0400; *(gattr++) = &(sattr++)->battr; } -- cgit v1.2.3 From 38cf307c1f2011d413750c5acb725456f47d9172 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2020 23:17:16 -0700 Subject: mm: fix kthread_use_mm() vs TLB invalidate For SMP systems using IPI based TLB invalidation, looking at current->active_mm is entirely reasonable. This then presents the following race condition: CPU0 CPU1 flush_tlb_mm(mm) use_mm(mm) tsk->active_mm = mm; if (tsk->active_mm == mm) // flush TLBs switch_mm(old_mm,mm,tsk); Where it is possible the IPI flushed the TLBs for @old_mm, not @mm, because the IPI lands before we actually switched. Avoid this by disabling IRQs across changing ->active_mm and switch_mm(). Of the (SMP) architectures that have IPI based TLB invalidate: Alpha - checks active_mm ARC - ASID specific IA64 - checks active_mm MIPS - ASID specific flush OpenRISC - shoots down world PARISC - shoots down world SH - ASID specific SPARC - ASID specific x86 - N/A xtensa - checks active_mm So at the very least Alpha, IA64 and Xtensa are suspect. On top of this, for scheduler consistency we need at least preemption disabled across changing tsk->mm and doing switch_mm(), which is currently provided by task_lock(), but that's not sufficient for PREEMPT_RT. 
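For illustration, the "checks active_mm" pattern referred to above looks roughly like the sketch below; this is simplified and not any particular architecture's real code:

	static void ipi_flush_tlb_mm(void *info)
	{
		struct mm_struct *mm = info;

		/*
		 * If this IPI lands after kthread_use_mm() has stored
		 * tsk->active_mm = mm but before it has run switch_mm(),
		 * the check passes while the CPU is still on the old mm,
		 * i.e. the flush is effectively for @old_mm, not @mm --
		 * the race that disabling IRQs across the switch closes.
		 */
		if (current->active_mm == mm)
			local_flush_tlb();
	}

	void flush_tlb_mm(struct mm_struct *mm)
	{
		preempt_disable();
		if (current->active_mm == mm)
			local_flush_tlb();
		smp_call_function(ipi_flush_tlb_mm, mm, 1);
		preempt_enable();
	}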
[akpm@linux-foundation.org: add comment] Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Cc: Nicholas Piggin Cc: Jens Axboe Cc: Kees Cook Cc: Jann Horn Cc: Will Deacon Cc: Christoph Hellwig Cc: Mathieu Desnoyers Cc: Link: http://lkml.kernel.org/r/20200721154106.GE10769@hirez.programming.kicks-ass.net Signed-off-by: Linus Torvalds --- kernel/kthread.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1d9e2fdfd67a..1c8964feeb01 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1241,13 +1241,16 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(tsk->mm); task_lock(tsk); + /* Hold off tlb flush IPIs while switching mm's */ + local_irq_disable(); active_mm = tsk->active_mm; if (active_mm != mm) { mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; - switch_mm(active_mm, mm, tsk); + switch_mm_irqs_off(active_mm, mm, tsk); + local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); @@ -1276,9 +1279,11 @@ void kthread_unuse_mm(struct mm_struct *mm) task_lock(tsk); sync_mm_rss(mm); + local_irq_disable(); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); + local_irq_enable(); task_unlock(tsk); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); -- cgit v1.2.3 From 4ca1085c9573ea08767521dabce62456e3fc2fd0 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Thu, 6 Aug 2020 23:17:19 -0700 Subject: kthread: remove incorrect comment in kthread_create_on_cpu() Originally kthread_create_on_cpu() parked and woke up the new thread. However, since commit a65d40961dc7 ("kthread/smpboot: do not park in kthread_create_on_cpu()") this is no longer the case. This patch removes the comment that has been left behind and is now incorrect / stale. Fixes: a65d40961dc7 ("kthread/smpboot: do not park in kthread_create_on_cpu()") Signed-off-by: Ilias Stamatis Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Link: http://lkml.kernel.org/r/20200611135920.240551-1-stamatis.iliass@gmail.com Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c8964feeb01..b2807e7be772 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind); * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread - * The thread will be woken and put into park mode. */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, -- cgit v1.2.3 From d42f3245c7e299e017213fa028c319316bcdb7f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Aug 2020 23:20:39 -0700 Subject: mm: memcg: convert vmstat slab counters to bytes In order to prepare for per-object slab memory accounting, convert NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE vmstat items to bytes. To make it obvious, rename them to NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B (similar to NR_KERNEL_STACK_KB). Internally global and per-node counters are stored in pages, however memcg and lruvec counters are stored in bytes. This scheme may look weird, but only for now. As soon as slab pages will be shared between multiple cgroups, global and node counters will reflect the total number of slab pages. However memcg and lruvec counters will be used for per-memcg slab memory tracking, which will take separate kernel objects in the account. 
Keeping global and node counters in pages helps to avoid additional overhead. The size of slab memory shouldn't exceed 4Gb on 32-bit machines, so it will fit into atomic_long_t we use for vmstats. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200623174037.3951353-4-guro@fb.com Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cef154261fe2..d25749bce7cf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_node_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) -- cgit v1.2.3 From 991e7673859ed41e7ba83c8c4e57afe8cfebe314 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 6 Aug 2020 23:21:37 -0700 Subject: mm: memcontrol: account kernel stack per node Currently the kernel stack is being accounted per-zone. There is no need to do that. In addition due to being per-zone, memcg has to keep a separate MEMCG_KERNEL_STACK_KB. Make the stat per-node and deprecate MEMCG_KERNEL_STACK_KB as memcg_stat_item is an extension of node_stat_item. In addition localize the kernel stack stats updates to account_kernel_stack(). Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200630161539.1759185-1-shakeelb@google.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 51 ++++++++++++++------------------------------------- kernel/scs.c | 2 +- 2 files changed, 15 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 76d3f3387554..c7b4ce9d2647 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk) if (vm) { int i; - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - -(int)(PAGE_SIZE / 1024)); - + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); - } for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], @@ -382,31 +377,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { - int i; - - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_zone_page_state(page_zone(vm->pages[i]), - NR_KERNEL_STACK_KB, - PAGE_SIZE / 1024 * account); - } - } else { - /* - * All stack pages are in the same zone and belong to the - * same memcg. - */ - struct page *first_page = virt_to_page(stack); - - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); - - mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - } + /* All stack pages are in the same node. 
*/ + if (vm) + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + else + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } static int memcg_charge_kernel_stack(struct task_struct *tsk) @@ -415,24 +393,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) struct vm_struct *vm = task_stack_vm_area(tsk); int ret; + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + if (vm) { int i; + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and both memcg_kmem_uncharge_page() - * and mod_memcg_page_state() in free_thread_stack() - * will ignore this page. So it's safe. + * pointer is NULL, and memcg_kmem_uncharge_page() in + * free_thread_stack() will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) return ret; - - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - PAGE_SIZE / 1024); } } #endif diff --git a/kernel/scs.c b/kernel/scs.c index 5d4d9bbdec36..4ff4a7ba0094 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -17,7 +17,7 @@ static void __scs_account(void *s, int account) { struct page *scs_page = virt_to_page(s); - mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } -- cgit v1.2.3 From 56f3547bfa4d361148aa748ccb86073bc57f5e6c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 6 Aug 2020 23:23:15 -0700 Subject: mm: adjust vm_committed_as_batch according to vm overcommit policy When checking a performance change for will-it-scale scalability mmap test [1], we found very high lock contention for spinlock of percpu counter 'vm_committed_as': 94.14% 0.35% [kernel.kallsyms] [k] _raw_spin_lock_irqsave 48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap; 45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap; Actually this heavy lock contention is not always necessary. The 'vm_committed_as' needs to be very precise when the strict OVERCOMMIT_NEVER policy is set, which requires a rather small batch number for the percpu counter. So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and lift it to 64X for OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies. Also add a sysctl handler to adjust it when the policy is reconfigured. Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T desktop, and 2097%(20X) on a 4S/72C/144T server. We tested with test platforms in 0day (server, desktop and laptop), and 80%+ platforms shows improvements with that test. And whether it shows improvements depends on if the test mmap size is bigger than the batch number computed. And if the lift is 16X, 1/3 of the platforms will show improvements, though it should help the mmap/unmap usage generally, as Michal Hocko mentioned: : I believe that there are non-synthetic worklaods which would benefit from : a larger batch. E.g. large in memory databases which do large mmaps : during startups from multiple threads. 
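For context, the contended code is the percpu_counter slow path; a simplified sketch of percpu_counter_add_batch() (preemption details elided, see lib/percpu_counter.c for the real thing) shows why lifting the batch by 64X removes most of the spinlock traffic in the profile above:

	void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
	{
		s64 count = __this_cpu_read(*fbc->counters) + amount;

		if (count >= batch || count <= -batch) {
			unsigned long flags;

			/* slow path: the lock that shows up in the profile */
			raw_spin_lock_irqsave(&fbc->lock, flags);
			fbc->count += count;
			__this_cpu_write(*fbc->counters, 0);
			raw_spin_unlock_irqrestore(&fbc->lock, flags);
		} else {
			/* fast path: pure per-CPU update, no lock taken */
			__this_cpu_write(*fbc->counters, count);
		}
	}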
[1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/ Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Mel Gorman Cc: Qian Cai Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Huang Ying Cc: Christoph Lameter Cc: Dennis Zhou Cc: Haiyang Zhang Cc: kernel test robot Cc: "K. Y. Srinivasan" Cc: Tejun Heo Link: http://lkml.kernel.org/r/1589611660-89854-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1592725000-73486-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1594389708-60781-5-git-send-email-feng.tang@intel.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b4d2dc270a5..f785de3caac0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, -- cgit v1.2.3 From 26e760c9a7c8ec31fa1a6bfbbce3f63f189ccef0 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Thu, 6 Aug 2020 23:24:35 -0700 Subject: rcu: kasan: record and print call_rcu() call stack Patch series "kasan: memorize and print call_rcu stack", v8. This patchset improves KASAN reports by making them to have call_rcu() call stack information. It is useful for programmers to solve use-after-free or double-free memory issue. The KASAN report was as follows(cleaned up slightly): BUG: KASAN: use-after-free in kasan_rcu_reclaim+0x58/0x60 Freed by task 0: kasan_save_stack+0x24/0x50 kasan_set_track+0x24/0x38 kasan_set_free_info+0x18/0x20 __kasan_slab_free+0x10c/0x170 kasan_slab_free+0x10/0x18 kfree+0x98/0x270 kasan_rcu_reclaim+0x1c/0x60 Last call_rcu(): kasan_save_stack+0x24/0x50 kasan_record_aux_stack+0xbc/0xd0 call_rcu+0x8c/0x580 kasan_rcu_uaf+0xf4/0xf8 Generic KASAN will record the last two call_rcu() call stacks and print up to 2 call_rcu() call stacks in KASAN report. it is only suitable for generic KASAN. This feature considers the size of struct kasan_alloc_meta and kasan_free_meta, we try to optimize the structure layout and size, lets it get better memory consumption. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ This patch (of 4): This feature will record the last two call_rcu() call stacks and prints up to 2 call_rcu() call stacks in KASAN report. When call_rcu() is called, we store the call_rcu() call stack into slub alloc meta-data, so that the KASAN report can print rcu stack. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ [walter-zh.wu@mediatek.com: build fix] Link: http://lkml.kernel.org/r/20200710162401.23816-1-walter-zh.wu@mediatek.com Suggested-by: Dmitry Vyukov Signed-off-by: Walter Wu Signed-off-by: Andrew Morton Tested-by: Dmitry Vyukov Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Acked-by: Paul E. 
McKenney Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Josh Triplett Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Jonathan Corbet Cc: Matthias Brugger Link: http://lkml.kernel.org/r/20200710162123.23713-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050847.1096-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050927.1153-1-walter-zh.wu@mediatek.com Signed-off-by: Linus Torvalds --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac7198ed3197..8ce77d9ac716 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); + kasan_record_aux_stack(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ -- cgit v1.2.3 From 8dcc1d34661d58a7889fb06517c8738d1412d1bc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 6 Aug 2020 23:24:57 -0700 Subject: kasan: don't tag stacks allocated with pagealloc Patch series "kasan: support stack instrumentation for tag-based mode", v2. This patch (of 5): Prepare Software Tag-Based KASAN for stack tagging support. With Tag-Based KASAN when kernel stacks are allocated via pagealloc (which happens when CONFIG_VMAP_STACK is not enabled), they get tagged. KASAN instrumentation doesn't expect the sp register to be tagged, and this leads to false-positive reports. Fix by resetting the tag of kernel stack pointers after allocation. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Cc: Walter Wu Cc: Elena Petrova Cc: Vincenzo Frascino Cc: Catalin Marinas Cc: Ard Biesheuvel Link: http://lkml.kernel.org/r/cover.1596199677.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/cover.1596544734.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/12d8c678869268dd0884b01271ab592f30792abf.1596544734.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/01c678b877755bcf29009176592402cdf6f2cb15.1596199677.git.andreyknvl@google.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=203497 Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c7b4ce9d2647..35e9894d394c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREAD_SIZE_ORDER); if (likely(page)) { - tsk->stack = page_address(page); + tsk->stack = kasan_reset_tag(page_address(page)); return tsk->stack; } return NULL; @@ -302,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + stack = kasan_reset_tag(stack); tsk->stack = stack; return stack; } -- cgit v1.2.3 From 38ce2a9e33db61a3041840310077072d6210ead4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Aug 2020 12:46:49 -0400 Subject: tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers As trace_array_printk() used with not global instances will not add noise to the main buffer, they are OK to have in the kernel (unlike trace_printk()). 
This require the subsystem to create their own tracing instance, and the trace_array_printk() only writes into those instances. Add trace_array_init_printk() to initialize the trace_printk() buffers without printing out the WARNING message. Reported-by: Sean Paul Reviewed-by: Sean Paul Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06c0feae5ff9..c5f822736261 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3129,6 +3129,9 @@ static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; + if (trace_percpu_buffer) + return 0; + buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; @@ -3331,6 +3334,26 @@ int trace_array_vprintk(struct trace_array *tr, return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever encorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) @@ -3355,6 +3378,27 @@ int trace_array_printk(struct trace_array *tr, } EXPORT_SYMBOL_GPL(trace_array_printk); +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) -- cgit v1.2.3 From 15d5761ad31dfb194ebe76554e6af0437eb20424 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 7 Jul 2020 18:21:16 +0900 Subject: kbuild: introduce ccflags-remove-y and asflags-remove-y CFLAGS_REMOVE_.o filters out flags when compiling a particular object, but there is no convenient way to do that for every object in a directory. Add ccflags-remove-y and asflags-remove-y to make it easily. Use ccflags-remove-y to clean up some Makefiles. 
The add/remove order works as follows: [1] KBUILD_CFLAGS specifies compiler flags used globally [2] ccflags-y adds compiler flags for all objects in the current Makefile [3] ccflags-remove-y removes compiler flags for all objects in the current Makefile (New feature) [4] CFLAGS_ adds compiler flags per file. [5] CFLAGS_REMOVE_ removes compiler flags per file. Having [3] before [4] allows us to remove flags from most (but not all) objects in the current Makefile. For example, kernel/trace/Makefile removes $(CC_FLAGS_FTRACE) from all objects in the directory, then adds it back to trace_selftest_dynamic.o and CFLAGS_trace_kprobe_selftest.o The same applies to lib/livepatch/Makefile. Please note ccflags-remove-y has no effect to the sub-directories. In contrast, the previous notation got rid of compiler flags also from all the sub-directories. The following are not affected because they have no sub-directories: arch/arm/boot/compressed/ arch/powerpc/xmon/ arch/sh/ kernel/trace/ However, lib/ has several sub-directories. To keep the behavior, I added ccflags-remove-y to all Makefiles in subdirectories of lib/, except the following: lib/vdso/Makefile - Kbuild does not descend into this Makefile lib/raid/test/Makefile - This is not used for the kernel build I think commit 2464a609ded0 ("ftrace: do not trace library functions") excluded too much. In the next commit, I will remove ccflags-remove-y from the sub-directories of lib/. Suggested-by: Sami Tolvanen Signed-off-by: Masahiro Yamada Acked-by: Steven Rostedt (VMware) Acked-by: Michael Ellerman (powerpc) Acked-by: Brendan Higgins (KUnit) Tested-by: Anders Roxell --- kernel/trace/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6575bb0a0434..7492844a8b1b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -2,9 +2,9 @@ # Do not instrument the tracer itself: +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + ifdef CONFIG_FUNCTION_TRACER -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) # Avoid recursion due to instrumentation. KCSAN_SANITIZE := n -- cgit v1.2.3 From b0294f30256bb6023b2044fd607855123863d98f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:32:48 -0700 Subject: time: Delete repeated words in comments Drop repeated words in kernel/time/. {when, one, into} Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/r/20200807033248.8452-1-rdunlap@infradead.org --- kernel/time/alarmtimer.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. 
*/ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c7212f3c603..35cd10f1143b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2001,7 +2001,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. -- cgit v1.2.3 From e27b1636e9337d1a1d174b191e53d0f86421a822 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 11 Aug 2020 11:00:01 -0700 Subject: genirq/PM: Always unlock IRQ descriptor in rearm_wake_irq() rearm_wake_irq() does not unlock the irq descriptor if the interrupt is not suspended or if wakeup is not enabled on it. Restructure the exit conditions so the unlock is always ensured. Fixes: 3a79bc63d9075 ("PCI: irq: Introduce rearm_wake_irq()") Signed-off-by: Guenter Roeck Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200811180001.80203-1-linux@roeck-us.net --- kernel/irq/pm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 8f557fa1f4fe..c6c7e187ae74 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq) unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - if (!desc || !(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) + if (!desc) return; + if (!(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + goto unlock; + desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); +unlock: irq_put_desc_busunlock(desc, flags); } -- cgit v1.2.3 From b518154e59aab3ad0780a169c5cc84bd4ee4357e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Aug 2020 18:30:40 -0700 Subject: mm/vmscan: protect the workingset on anonymous LRU In the current implementation, a newly created or swapped-in anonymous page starts out on the active list. Growing the active list triggers rebalancing of the active/inactive lists, so old pages on the active list are demoted to the inactive list. Hence, pages on the active list aren't protected at all. Following is an example of this situation. Assume there are 50 hot pages on the active list. Numbers denote the number of pages on active/inactive list (active | inactive). 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(uo) | 50(h) 3. workload: another 50 newly created (used-once) pages 50(uo) | 50(uo), swap-out 50(h) This patch tries to fix this issue. As with the file LRU, newly created or swapped-in anonymous pages will be inserted on the inactive list.
They are promoted to active list if enough reference happens. This simple modification changes the above example as following. 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(h) | 50(uo) 3. workload: another 50 newly created (used-once) pages 50(h) | 50(uo), swap-out 50(uo) As you can see, hot pages on active list would be protected. Note that, this implementation has a drawback that the page cannot be promoted and will be swapped-out if re-access interval is greater than the size of inactive list but less than the size of total(active+inactive). To solve this potential issue, following patch will apply workingset detection similar to the one that's already applied to file LRU. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Link: http://lkml.kernel.org/r/1595490560-15117-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 25de10c904e6..49047d479c57 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); -- cgit v1.2.3 From facdaa917c4d5a376d09d25865f5a863f906234a Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:00 -0700 Subject: mm: proactive compaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some applications, we need to allocate almost all memory as hugepages. However, on a running system, higher-order allocations can fail if the memory is fragmented. Linux kernel currently does on-demand compaction as we request more hugepages, but this style of compaction incurs very high latency. Experiments with one-time full memory compaction (followed by hugepage allocations) show that kernel is able to restore a highly fragmented memory state to a fairly compacted memory state within <1 sec for a 32G system. Such data suggests that a more proactive compaction can help us allocate a large fraction of memory as hugepages keeping allocation latencies low. For a more proactive compaction, the approach taken here is to define a new sysctl called 'vm.compaction_proactiveness' which dictates bounds for external fragmentation which kcompactd tries to maintain. The tunable takes a value in range [0, 100], with a default of 20. Note that a previous version of this patch [1] was found to introduce too many tunables (per-order extfrag{low, high}), but this one reduces them to just one sysctl. Also, the new tunable is an opaque value instead of asking for specific bounds of "external fragmentation", which would have been difficult to estimate. The internal interpretation of this opaque value allows for future fine-tuning. Currently, we use a simple translation from this tunable to [low, high] "fragmentation score" thresholds (low=100-proactiveness, high=low+10%). The score for a node is defined as weighted mean of per-zone external fragmentation. 
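[ A hedged sketch of the proactiveness-to-threshold translation just described; the helper names are illustrative, not the kernel's exact symbols:

        /* vm.compaction_proactiveness is clamped to [0, 100]; default is 20. */
        static unsigned int frag_score_low(unsigned int proactiveness)
        {
                return 100 - proactiveness;                     /* default: 80 */
        }

        static unsigned int frag_score_high(unsigned int proactiveness)
        {
                return frag_score_low(proactiveness) + 10;      /* default: 90 */
        }

        /*
         * The per-node score is a mean of each zone's external fragmentation,
         * weighted by zone->present_pages. kcompactd starts proactive
         * compaction once the score exceeds the high threshold and keeps
         * going until it drops back to the low threshold.
         */
]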
A zone's present_pages determines its weight. To periodically check per-node score, we reuse per-node kcompactd threads, which are woken up every 500 milliseconds to check the same. If a node's score exceeds its high threshold (as derived from user-provided proactiveness value), proactive compaction is started until its score reaches its low threshold value. By default, proactiveness is set to 20, which implies threshold values of low=80 and high=90. This patch is largely based on ideas from Michal Hocko [2]. See also the LWN article [3]. Performance data ================ System: x64_64, 1T RAM, 80 CPU threads. Kernel: 5.6.0-rc3 + this patch echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/defrag Before starting the driver, the system was fragmented from a userspace program that allocates all memory and then for each 2M aligned section, frees 3/4 of base pages using munmap. The workload is mainly anonymous userspace pages, which are easy to move around. I intentionally avoided unmovable pages in this test to see how much latency we incur when hugepage allocations hit direct compaction. 1. Kernel hugepage allocation latencies With the system in such a fragmented state, a kernel driver then allocates as many hugepages as possible and measures allocation latency: (all latency values are in microseconds) - With vanilla 5.6.0-rc3 percentile latency –––––––––– ––––––– 5 7894 10 9496 25 12561 30 15295 40 18244 50 21229 60 27556 75 30147 80 31047 90 32859 95 33799 Total 2M hugepages allocated = 383859 (749G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) - With 5.6.0-rc3 + this patch, with proactiveness=20 sysctl -w vm.compaction_proactiveness=20 percentile latency –––––––––– ––––––– 5 2 10 2 25 3 30 3 40 3 50 4 60 4 75 4 80 4 90 5 95 429 Total 2M hugepages allocated = 384105 (750G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) 2. JAVA heap allocation In this test, we first fragment memory using the same method as for (1). Then, we start a Java process with a heap size set to 700G and request the heap to be allocated with THP hugepages. We also set THP to madvise to allow hugepage backing of this heap. /usr/bin/time java -Xms700G -Xmx700G -XX:+UseTransparentHugePages -XX:+AlwaysPreTouch The above command allocates 700G of Java heap using hugepages. - With vanilla 5.6.0-rc3 17.39user 1666.48system 27:37.89elapsed - With 5.6.0-rc3 + this patch, with proactiveness=20 8.35user 194.58system 3:19.62elapsed Elapsed time remains around 3:15, as proactiveness is further increased. Note that proactive compaction happens throughout the runtime of these workloads. The situation of one-time compaction, sufficient to supply hugepages for following allocation stream, can probably happen for more extreme proactiveness values, like 80 or 90. In the above Java workload, proactiveness is set to 20. The test starts with a node's score of 80 or higher, depending on the delay between the fragmentation step and starting the benchmark, which gives more-or-less time for the initial round of compaction. As t he benchmark consumes hugepages, node's score quickly rises above the high threshold (90) and proactive compaction starts again, which brings down the score to the low threshold level (80). Repeat. bpftrace also confirms proactive compaction running 20+ times during the runtime of this Java benchmark. 
kcompactd threads consume 100% of one of the CPUs while it tries to bring a node's score within thresholds. Backoff behavior ================ Above workloads produce a memory state which is easy to compact. However, if memory is filled with unmovable pages, proactive compaction should essentially back off. To test this aspect: - Created a kernel driver that allocates almost all memory as hugepages followed by freeing first 3/4 of each hugepage. - Set proactiveness=40 - Note that proactive_compact_node() is deferred maximum number of times with HPAGE_FRAG_CHECK_INTERVAL_MSEC of wait between each check (=> ~30 seconds between retries). [1] https://patchwork.kernel.org/patch/11098289/ [2] https://lore.kernel.org/linux-mm/20161230131412.GI13301@dhcp22.suse.cz/ [3] https://lwn.net/Articles/817905/ Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Tested-by: Oleksandr Natalenko Reviewed-by: Vlastimil Babka Reviewed-by: Khalid Aziz Reviewed-by: Oleksandr Natalenko Cc: Vlastimil Babka Cc: Khalid Aziz Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Joonsoo Kim Cc: David Rientjes Cc: Nitin Gupta Cc: Oleksandr Natalenko Link: http://lkml.kernel.org/r/20200616204527.19185-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f785de3caac0..ab72e52f8a7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,6 +2851,15 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, -- cgit v1.2.3 From d34c0a7599ea8c301bc471dfa1eb2bf2db6752d1 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:07 -0700 Subject: mm: use unsigned types for fragmentation score Proactive compaction uses per-node/zone "fragmentation score" which is always in range [0, 100], so use unsigned type of these scores as well as for related constants. Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Vlastimil Babka Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200618010319.13159-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ab72e52f8a7b..287862f91717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2854,7 +2854,7 @@ static struct ctl_table vm_table[] = { { .procname = "compaction_proactiveness", .data = &sysctl_compaction_proactiveness, - .maxlen = sizeof(int), + .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, -- cgit v1.2.3 From 3d13f313ce4c34c524ccc37986fe77172f601ff3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:47 -0700 Subject: uaccess: add force_uaccess_{begin,end} helpers Add helpers to wrap the get_fs/set_fs magic for undoing any damange done by set_fs(KERNEL_DS). 
There is no real functional benefit, but this documents the intent of these calls better, and will allow stubbing the functions out easily for kernels builds that do not allow address space overrides in the future. [hch@lst.de: drop two incorrect hunks, fix a commit log typo] Link: http://lkml.kernel.org/r/20200714105505.935079-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Mark Rutland Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/events/callchain.c | 5 ++--- kernel/events/core.c | 5 ++--- kernel/kthread.c | 5 ++--- kernel/stacktrace.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c6ce894e4ce9..58cbe357fb2b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - set_fs(fs); + force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index d1f0a7e5b182..6961333ebad5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - set_fs(fs); + force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); diff --git a/kernel/kthread.c b/kernel/kthread.c index b2807e7be772..3edaa380dc7b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); - to_kthread(tsk)->oldfs = get_fs(); - set_fs(USER_DS); + to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - set_fs(to_kthread(tsk)->oldfs); + force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); sync_mm_rss(mm); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 2af66e449aa6..946f44a9e86a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) if (current->flags & PF_KTHREAD) return 0; - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - set_fs(fs); + force_uaccess_end(fs); return c.len; } -- cgit v1.2.3 From fe81417596fa8b6577fedb7e206ff3e4c7015c13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:50 -0700 Subject: exec: use force_uaccess_begin during exec and exit Both exec and exit want to ensure that the uaccess routines actually do access user pointers. Use the newly added force_uaccess_begin helper instead of an open coded set_fs for that to prepare for kernel builds where set_fs() does not exist. 
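[ For reference, a minimal sketch of the calling pattern these conversions follow, mirroring the hunks above; the wrapper function and its fallback behaviour are made up for illustration:

        static void sample_read_user(void __user *uptr, void *dst, size_t len)
        {
                mm_segment_t old_fs;

                old_fs = force_uaccess_begin();  /* was: old_fs = get_fs(); set_fs(USER_DS); */
                if (copy_from_user(dst, uptr, len))
                        memset(dst, 0, len);     /* illustrative fault handling only */
                force_uaccess_end(old_fs);       /* was: set_fs(old_fs); */
        }
]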
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200710135706.537715-7-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e731c414e024..c2d2961576f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -732,7 +732,7 @@ void __noreturn do_exit(long code) * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ - set_fs(USER_DS); + force_uaccess_begin(); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", -- cgit v1.2.3 From 8043fc147a97ec2eefc582487f344f2cbe86d12e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:34:10 -0700 Subject: kernel: add a kernel_wait helper Add a helper that waits for a pid and stores the status in the passed in kernel pointer. Use it to fix the usage of kernel_wait4 in call_usermodehelper_exec_sync that only happens to work due to the implicit set_fs(KERNEL_DS) for kernel threads. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Acked-by: "Eric W. Biederman" Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200721130449.5008-1-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/exit.c | 16 ++++++++++++++++ kernel/umh.c | 29 ++++------------------------- 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c2d2961576f2..733e80f334e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { diff --git a/kernel/umh.c b/kernel/umh.c index a25433f9cd9a..fcf3ee803630 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ + /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { + if (pid < 0) sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - kernel_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. 
- */ - if (ret) - sub_info->retval = ret; - } + else + kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); - umh_complete(sub_info); } -- cgit v1.2.3 From 6f9e148c218641ad876845abdea4b0597468c55e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 11 Aug 2020 18:36:12 -0700 Subject: kmod: remove redundant "be an" in the comment There exists redundant "be an" in the comment, remove it. Signed-off-by: Tiezhu Yang Signed-off-by: Luis Chamberlain Signed-off-by: Andrew Morton Acked-by: Luis Chamberlain Cc: Alexei Starovoitov Cc: Al Viro Cc: Christian Brauner Cc: Chuck Lever Cc: David Howells Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Jakub Kicinski Cc: James Morris Cc: Jarkko Sakkinen Cc: J. Bruce Fields Cc: Jens Axboe Cc: Josh Triplett Cc: Kees Cook Cc: Lars Ellenberg Cc: Nikolay Aleksandrov Cc: Philipp Reisner Cc: Roopa Prabhu Cc: "Serge E. Hallyn" Cc: Sergei Trofimovich Cc: Sergey Kvachonok Cc: Shuah Khan Cc: Tony Vroon Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200610154923.27510-3-mcgrof@kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 37c3c4b97b8e..3cd075ce2a1e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,9 +36,8 @@ * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, - * and this would only be an be an upper limit, after which the OOM killer - * would take effect. Systems like these are very unlikely if modules are - * enabled. + * and this would only be an upper limit, after which the OOM killer would take + * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); -- cgit v1.2.3 From 0935288c6e008c0682ab6171fb5605dcc049e7bd Mon Sep 17 00:00:00 2001 From: Vijay Balakrishna Date: Tue, 11 Aug 2020 18:36:33 -0700 Subject: kdump: append kernel build-id string to VMCOREINFO Make kernel GNU build-id available in VMCOREINFO. Having build-id in VMCOREINFO facilitates presenting appropriate kernel namelist image with debug information file to kernel crash dump analysis tools. Currently VMCOREINFO lacks uniquely identifiable key for crash analysis automation. Regarding if this patch is necessary or matching of linux_banner and OSRELEASE in VMCOREINFO employed by crash(8) meets the need -- IMO, build-id approach more foolproof, in most instances it is a cryptographic hash generated using internal code/ELF bits unlike kernel version string upon which linux_banner is based that is external to the code. I feel each is intended for a different purpose. Also OSRELEASE is not suitable when two different kernel builds from same version with different features enabled. Currently for most linux (and non-linux) systems build-id can be extracted using standard methods for file types such as user mode crash dumps, shared libraries, loadable kernel modules etc., This is an exception for linux kernel dump. Having build-id in VMCOREINFO brings some uniformity for automation tools. Tyler said: : I think this is a nice improvement over today's linux_banner approach for : correlating vmlinux to a kernel dump. : : The elf notes parsing in this patch lines up with what is described in in : the "Notes (Nhdr)" section of the elf(5) man page. 
: : BUILD_ID_MAX is sufficient to hold a sha1 build-id, which is the default : build-id type today in GNU ld(2). It is also sufficient to hold the : "fast" build-id, which is the default build-id type today in LLVM lld(2). Signed-off-by: Vijay Balakrishna Signed-off-by: Andrew Morton Reviewed-by: Tyler Hicks Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Link: http://lkml.kernel.org/r/1591849672-34104-1-git-send-email-vijayb@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 18175687133a..106e4500fd53 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,6 +11,8 @@ #include #include +#include + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +#define NOTES_SIZE (&__stop_notes - &__start_notes) +#define BUILD_ID_MAX SHA1_DIGEST_SIZE +#define NT_GNU_BUILD_ID 3 + +struct elf_note_section { + struct elf_note n_hdr; + u8 n_data[]; +}; + +/* + * Add build ID from .notes section as generated by the GNU ld(1) + * or LLVM lld(1) --build-id option. + */ +static void add_build_id_vmcoreinfo(void) +{ + char build_id[BUILD_ID_MAX * 2 + 1]; + int n_remain = NOTES_SIZE; + + while (n_remain >= sizeof(struct elf_note)) { + const struct elf_note_section *note_sec = + &__start_notes + NOTES_SIZE - n_remain; + const u32 n_namesz = note_sec->n_hdr.n_namesz; + + if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID && + n_namesz != 0 && + !strcmp((char *)¬e_sec->n_data[0], "GNU")) { + if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) { + const u32 n_descsz = note_sec->n_hdr.n_descsz; + const u8 *s = ¬e_sec->n_data[n_namesz]; + + s = PTR_ALIGN(s, 4); + bin2hex(build_id, s, n_descsz); + build_id[2 * n_descsz] = '\0'; + VMCOREINFO_BUILD_ID(build_id); + return; + } + pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n", + note_sec->n_hdr.n_descsz, + BUILD_ID_MAX); + return; + } + n_remain -= sizeof(struct elf_note) + + ALIGN(note_sec->n_hdr.n_namesz, 4) + + ALIGN(note_sec->n_hdr.n_descsz, 4); + } +} + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void) } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + add_build_id_vmcoreinfo(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); -- cgit v1.2.3 From 79076e1241bb3bf02d0aac7d39120d8161fe07b1 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 11 Aug 2020 18:36:46 -0700 Subject: kernel/panic.c: make oops_may_print() return bool The return value of oops_may_print() is true or false, so change its type to reflect that. Signed-off-by: Tiezhu Yang Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Xuefeng Li Link: http://lkml.kernel.org/r/1591103358-32087-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index e2157ca387c8..93ba061d1c79 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -505,7 +505,7 @@ static void do_oops_enter_exit(void) * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. 
*/ -int oops_may_print(void) +bool oops_may_print(void) { return pause_on_oops_flag == 0; } -- cgit v1.2.3 From 63037f74725ddd8a767ed2ad0369e60a3bf1f2ce Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 11 Aug 2020 18:36:53 -0700 Subject: panic: make print_oops_end_marker() static Since print_oops_end_marker() is not used externally, also remove it in kernel.h at the same time. Signed-off-by: Yue Hu Signed-off-by: Andrew Morton Cc: Kees Cook Link: http://lkml.kernel.org/r/20200724011516.12756-1-zbestahu@gmail.com Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 93ba061d1c79..aef8872ba843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -551,7 +551,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -void print_oops_end_marker(void) +static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); -- cgit v1.2.3 From 31a1b9878c0667945b078c92bce907552f57f2b6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Aug 2020 18:36:56 -0700 Subject: kcov: unconditionally add -fno-stack-protector to compiler options Unconditionally add -fno-stack-protector to KCOV's compiler options, as all supported compilers support the option. This saves a compiler invocation to determine if the option is supported. Because Clang does not support -fno-conserve-stack, and -fno-stack-protector was wrapped in the same cc-option, we were missing -fno-stack-protector with Clang. Unconditionally adding this option fixes this for Clang. Suggested-by: Nick Desaulniers Signed-off-by: Marco Elver Signed-off-by: Andrew Morton Reviewed-by: Nick Desaulniers Reviewed-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Link: http://lkml.kernel.org/r/20200615184302.7591-1-elver@google.com Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 5350fd292910..b3da548691c9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n -CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) -- cgit v1.2.3 From fed79d057d0842d2e381632b783af30745dd7938 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 11 Aug 2020 18:36:59 -0700 Subject: kcov: make some symbols static Fix sparse build warnings: kernel/kcov.c:99:1: warning: symbol '__pcpu_scope_kcov_percpu_data' was not declared. Should it be static? kernel/kcov.c:778:6: warning: symbol 'kcov_remote_softirq_start' was not declared. Should it be static? kernel/kcov.c:795:6: warning: symbol 'kcov_remote_softirq_stop' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Reviewed-by: Andrey Konovalov Link: http://lkml.kernel.org/r/20200702115501.73077-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 6afae0bcbac4..6b8368be89c8 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -96,7 +96,7 @@ struct kcov_percpu_data { int saved_sequence; }; -DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } -void kcov_remote_softirq_start(struct task_struct *t) +static void kcov_remote_softirq_start(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); unsigned int mode; @@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t) } } -void kcov_remote_softirq_stop(struct task_struct *t) +static void kcov_remote_softirq_stop(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); -- cgit v1.2.3 From 64019a2e467a288a16b65ab55ddcbf58c1b00187 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:39:01 -0700 Subject: mm/gup: remove task_struct pointer for all gup code After the cleanup of page fault accounting, gup does not need to pass task_struct around any more. Remove that parameter in the whole gup stack. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Link: http://lkml.kernel.org/r/20200707225021.200906-26-peterx@redhat.com Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 +++--- kernel/futex.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 49047d479c57..649fd53dc9ad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) if (!vaddr || !d) return -EINVAL; - ret = get_user_pages_remote(NULL, mm, vaddr, 1, + ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, &vma, NULL); if (unlikely(ret <= 0)) { /* @@ -477,7 +477,7 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. 
*/ - result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL, NULL); if (result < 0) return result; diff --git a/kernel/futex.c b/kernel/futex.c index 83404124b77b..61e8153e6c76 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; mmap_read_lock(mm); - ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); -- cgit v1.2.3 From b33164f2bd1cedb094c32cb466287116164457ae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Aug 2020 14:31:02 +0200 Subject: bpf: Iterate through all PT_NOTE sections when looking for build id Currently when we look for build id within bpf_get_stackid helper call, we check the first NOTE section and we fail if build id is not there. However on some system (Fedora) there can be multiple NOTE sections in binaries and build id data is not always the first one, like: $ readelf -a /usr/bin/ls ... [ 2] .note.gnu.propert NOTE 0000000000000338 00000338 0000000000000020 0000000000000000 A 0 0 8358 [ 3] .note.gnu.build-i NOTE 0000000000000358 00000358 0000000000000024 0000000000000000 A 0 0 437c [ 4] .note.ABI-tag NOTE 000000000000037c 0000037c ... So the stack_map_get_build_id function will fail on build id retrieval and fallback to BPF_STACK_BUILD_ID_IP. This patch is changing the stack_map_get_build_id code to iterate through all the NOTE sections and try to get build id data from each of them. When tracing on sched_switch tracepoint that does bpf_get_stackid helper call kernel build, I can see about 60% increase of successful build id retrieval. The rest seems fails on -EFAULT. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200812123102.20032-1-jolsa@kernel.org --- kernel/bpf/stackmap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4fd830a62be2..cfed0ac44d38 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr, phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr, phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } -- cgit v1.2.3 From f107cee94ba4d2c7357fde59a1d84346c73d4958 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 11 Aug 2020 11:00:12 -0700 Subject: genirq: Unlock irq descriptor after errors In irq_set_irqchip_state(), the irq descriptor is not unlocked after an error is encountered. 
While that should never happen in practice, a buggy driver may trigger it. This would result in a lockup, so fix it. Fixes: 1d0326f352bb ("genirq: Check irq_data_get_irq_chip() return value before use") Signed-off-by: Guenter Roeck Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200811180012.80269-1-linux@roeck-us.net --- kernel/irq/manage.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d55ba625d426..52ac5391dcc6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2731,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) - return -ENODEV; + if (WARN_ON_ONCE(!chip)) { + err = -ENODEV; + goto out_unlock; + } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2745,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, if (data) err = chip->irq_set_irqchip_state(data, which, val); +out_unlock: irq_put_desc_busunlock(desc, flags); return err; } -- cgit v1.2.3 From ebf0d100df0731901c16632f78d78d35f4123bc4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 09:01:38 -0600 Subject: task_work: only grab task signal lock when needed If JOBCTL_TASK_WORK is already set on the targeted task, then we need not go through {lock,unlock}_task_sighand() to set it again and queue a signal wakeup. This is safe as we're checking it _after_ adding the new task_work with cmpxchg(). The ordering is as follows: task_work_add() get_signal() -------------------------------------------------------------- STORE(task->task_works, new_work); STORE(task->jobctl); mb(); mb(); LOAD(task->jobctl); LOAD(task->task_works); This speeds up TWA_SIGNAL handling quite a bit, which is important now that io_uring is relying on it for all task_work deliveries. Cc: Peter Zijlstra Cc: Jann Horn Acked-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/signal.c | 16 +++++++++++++++- kernel/task_work.c | 8 +++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6f16f7c5d375..42b67d2cea37 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; + /* + * Make sure we can safely read ->jobctl() in task_work add. As Oleg + * states: + * + * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we + * roughly have + * + * task_work_add: get_signal: + * STORE(task->task_works, new_work); STORE(task->jobctl); + * mb(); mb(); + * LOAD(task->jobctl); LOAD(task->task_works); + * + * and we can rely on STORE-MB-LOAD [ in task_work_add]. + */ + smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); if (unlikely(current->task_works)) { spin_unlock_irq(&sighand->siglock); task_work_run(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c0848ca1287..613b2d634af8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - if (lock_task_sighand(task, &flags)) { + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. 
+ */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { task->jobctl |= JOBCTL_TASK_WORK; signal_wake_up(task, 0); unlock_task_sighand(task, &flags); -- cgit v1.2.3 From 405fa8ac89e7aaa87282df659e525992f2639e76 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Aug 2020 08:21:17 -0400 Subject: futex: Convert to use the preferred 'fallthrough' macro Signed-off-by: Miaohe Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200813122117.51173-1-linmiaohe@huawei.com --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 61e8153e6c76..a5876694a60e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: -- cgit v1.2.3 From cc172ff301d8079e941a6eb31758951a6d764084 Mon Sep 17 00:00:00 2001 From: Libing Zhou Date: Fri, 14 Aug 2020 11:02:36 +0800 Subject: sched/debug: Fix the alignment of the show-state debug output Current sysrq(t) output task fields name are not aligned with actual task fields value, e.g.: kernel: sysrq: Show State kernel: task PC stack pid father kernel: systemd S12456 1 0 0x00000000 kernel: Call Trace: kernel: ? __schedule+0x240/0x740 To make it more readable, print fields name together with task fields value in the same line, with fixed width: kernel: sysrq: Show State kernel: task:systemd state:S stack:12920 pid: 1 ppid: 0 flags:0x00000000 kernel: Call Trace: kernel: __schedule+0x282/0x620 Signed-off-by: Libing Zhou Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200814030236.37835-1-libing.zhou@nokia-sbell.com --- kernel/sched/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a0e7b449b88..09fd62568ba9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6387,10 +6387,10 @@ void sched_show_task(struct task_struct *p) if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -6399,8 +6399,8 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); @@ -6435,13 +6435,6 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* -- cgit v1.2.3 From 9420139f516d7fbc248ce17f35275cb005ed98ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 12:26:24 
+0200 Subject: dma-pool: fix coherent pool allocations for IOMMU mappings When allocating coherent pool memory for an IOMMU mapping we don't care about the DMA mask. Move the guess for the initial GFP mask into the dma_direct_alloc_pages and pass dma_coherent_ok as a function pointer argument so that it doesn't get applied to the IOMMU case. Signed-off-by: Christoph Hellwig Tested-by: Amit Pundir --- kernel/dma/direct.c | 13 ++++-- kernel/dma/pool.c | 114 ++++++++++++++++++++++------------------------------ 2 files changed, 57 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bb0041e99659..db6ef07aec3b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,7 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -68,7 +68,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -161,8 +161,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, size, &page, gfp); - if (!ret) + u64 phys_mask; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, + dma_coherent_ok); + if (!page) return NULL; goto done; } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 6bc74a2d5127..5d071d4a3cba 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -196,93 +196,75 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) +static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { - u64 phys_mask; - gfp_t gfp; - - gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + if (prev == NULL) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return atomic_pool_dma32; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return atomic_pool_dma; + return atomic_pool_kernel; + } + if (prev == atomic_pool_kernel) + return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; + if (prev == atomic_pool_dma32) return atomic_pool_dma; - if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) - return atomic_pool_dma32; - return atomic_pool_kernel; + return NULL; } -static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +static struct page *__dma_alloc_from_pool(struct device *dev, size_t size, + struct gen_pool *pool, void **cpu_addr, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { - if (bad_pool == atomic_pool_kernel) - return atomic_pool_dma32 ? 
: atomic_pool_dma; + unsigned long addr; + phys_addr_t phys; - if (bad_pool == atomic_pool_dma32) - return atomic_pool_dma; + addr = gen_pool_alloc(pool, size); + if (!addr) + return NULL; - return NULL; -} + phys = gen_pool_virt_to_phys(pool, addr); + if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) { + gen_pool_free(pool, addr, size); + return NULL; + } -static inline struct gen_pool *dma_guess_pool(struct device *dev, - struct gen_pool *bad_pool) -{ - if (bad_pool) - return dma_get_safer_pool(bad_pool); + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); - return dma_guess_pool_from_device(dev); + *cpu_addr = (void *)addr; + memset(*cpu_addr, 0, size); + return pfn_to_page(__phys_to_pfn(phys)); } -void *dma_alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, gfp_t flags) +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t gfp, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { struct gen_pool *pool = NULL; - unsigned long val = 0; - void *ptr = NULL; - phys_addr_t phys; - - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) { - WARN(1, "Failed to get suitable pool for %s\n", - dev_name(dev)); - break; - } - - val = gen_pool_alloc(pool, size); - if (!val) - continue; - - phys = gen_pool_virt_to_phys(pool, val); - if (dma_coherent_ok(dev, phys, size)) - break; - - gen_pool_free(pool, val, size); - val = 0; - } - - - if (val) { - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); + struct page *page; - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); + while ((pool = dma_guess_pool(pool, gfp))) { + page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, + phys_addr_ok); + if (page) + return page; } - return ptr; + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + return NULL; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = NULL; - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) - return false; - - if (gen_pool_has_addr(pool, (unsigned long)start, size)) { - gen_pool_free(pool, (unsigned long)start, size); - return true; - } + while ((pool = dma_guess_pool(pool, 0))) { + if (!gen_pool_has_addr(pool, (unsigned long)start, size)) + continue; + gen_pool_free(pool, (unsigned long)start, size); + return true; } + + return false; } -- cgit v1.2.3 From d7e673ec2c8e0ea39c4c70fc490d67d7fbda869d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 14 Aug 2020 12:26:23 +0200 Subject: dma-pool: Only allocate from CMA when in same memory zone There is no guarantee to CMA's placement, so allocating a zone specific atomic pool from CMA might return memory from a completely different memory zone. To get around this double check CMA's placement before allocating from it. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5d071d4a3cba..06582b488e31 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,7 +3,9 @@ * Copyright (C) 2012 ARM Ltd. 
* Copyright (C) 2020 Google LLC */ +#include #include +#include #include #include #include @@ -55,6 +57,29 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) pool_size_kernel += size; } +static bool cma_in_zone(gfp_t gfp) +{ + unsigned long size; + phys_addr_t end; + struct cma *cma; + + cma = dev_get_cma_area(NULL); + if (!cma) + return false; + + size = cma_get_size(cma); + if (!size) + return false; + + /* CMA can't cross zone boundaries, see cma_activate_area() */ + end = cma_get_base(cma) + size - 1; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return end <= DMA_BIT_MASK(zone_dma_bits); + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return end <= DMA_BIT_MASK(32); + return true; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -68,7 +93,11 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - page = alloc_pages(gfp, order); + if (cma_in_zone(gfp)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + if (!page) + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; -- cgit v1.2.3 From a85ffd59bd362d6c2e456e7f523b091830cd5454 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 12 Aug 2020 20:17:00 -0700 Subject: dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock() Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") improved unlock_page(), it has become more noticeable how cow_user_page() in a kernel with CONFIG_DMA_API_DEBUG=y can create and suffer from heavy contention on DMA debug's radix_lock in debug_dma_assert_idle(). It is only doing a lookup: use rcu_read_lock() and rcu_read_unlock() instead; though that does require the static ents[] to be moved onstack... ...but, hold on, isn't that radix_tree_gang_lookup() and loop doing quite the wrong thing: searching CACHELINES_PER_PAGE entries for an exact match with the first cacheline of the page in question? radix_tree_gang_lookup() is the right tool for the job, but we need nothing more than to check the first entry it can find, reporting if that falls anywhere within the page. (Is RCU safe here? As safe as using the spinlock was. The entries are never freed, so don't need to be freed by RCU. They may be reused, and there is a faint chance of a race, with an offending entry reused while printing its error info; but the spinlock did not prevent that either, and I agree that it's not worth worrying about. ] [ Side noe: this patch is a clear improvement to the status quo, but the next patch will be removing this debug function entirely. 
But just in case we decide we want to resurrect the debugging code some day, I'm first applying this improvement patch so that it doesn't get lost - Linus ] Fixes: 3b7a6418c749 ("dma debug: account for cachelines and read-only mappings in overlap tracking") Signed-off-by: Hugh Dickins Acked-by: Dan Williams Acked-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- kernel/dma/debug.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f7f807fb6ebb..6f4f4b9d2d03 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -565,11 +565,8 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) */ void debug_dma_assert_idle(struct page *page) { - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; + struct dma_debug_entry *entry; + unsigned long pfn; phys_addr_t cln; if (dma_debug_disabled()) @@ -578,20 +575,14 @@ void debug_dma_assert_idle(struct page *page) if (!page) return; - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); + pfn = page_to_pfn(page); + cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT; - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); + rcu_read_lock(); + if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry, + cln, 1) || entry->pfn != pfn) + entry = NULL; + rcu_read_unlock(); if (!entry) return; -- cgit v1.2.3 From 5848dc5b1b76d83599dcec1b39f188a7cbdca7e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Aug 2020 15:22:43 -0700 Subject: dma-debug: remove debug_dma_assert_idle() function This remoes the code from the COW path to call debug_dma_assert_idle(), which was added many years ago. Google shows that it hasn't caught anything in the 6+ years we've had it apart from a false positive, and Hugh just noticed how it had a very unfortunate spinlock serialization in the COW path. He fixed that issue the previous commit (a85ffd59bd36: "dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()"), but let's see if anybody even notices when we remove this function entirely. NOTE! We keep the dma tracking infrastructure that was added by the commit that introduced it. Partly to make it easier to resurrect this debug code if we ever deside to, and partly because that tracking by pfn and offset looks quite reasonable. The problem with this debug code was simply that it was expensive and didn't seem worth it, not that it was wrong per se. Acked-by: Dan Williams Acked-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- kernel/dma/Kconfig | 5 ----- kernel/dma/debug.c | 46 +--------------------------------------------- 2 files changed, 1 insertion(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index f4770fcfa62b..5732b2b3ef17 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -186,11 +186,6 @@ config DMA_API_DEBUG drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. 
- This also attempts to catch cases where a page owned by DMA is - accessed by the cpu in a way that could cause data corruption. For - example, this enables cow_user_page() to check that the source page is - not undergoing DMA. - This option causes a performance degradation. Use only if you want to debug device drivers and dma interactions. diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6f4f4b9d2d03..8e9f7b301c6d 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. */ static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); static DEFINE_SPINLOCK(radix_lock); @@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) overlap = active_cacheline_set_overlap(cln, ++overlap); /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. + * leaking dma-mappings. */ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), @@ -555,44 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) spin_unlock_irqrestore(&radix_lock, flags); } -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. - */ -void debug_dma_assert_idle(struct page *page) -{ - struct dma_debug_entry *entry; - unsigned long pfn; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - pfn = page_to_pfn(page); - cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT; - - rcu_read_lock(); - if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry, - cln, 1) || entry->pfn != pfn) - entry = NULL; - rcu_read_unlock(); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. -- cgit v1.2.3 From 846f9e1fb9b9a2c4eecefa2fdbfb51e975a7d532 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jul 2020 14:18:54 +0200 Subject: dma-mapping: consolidate the NO_DMA definition in kernel/dma/Kconfig Have a single definition that architetures can select. 
Signed-off-by: Christoph Hellwig Signed-off-by: Rich Felker --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1da3f44f2565..57533d07676f 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config NO_DMA + bool + config HAS_DMA bool depends on !NO_DMA -- cgit v1.2.3 From 88db0aa2421666d2f73486d15b239a4521983d55 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 14 Aug 2020 17:31:07 -0700 Subject: all arch: remove system call sys_sysctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl is actually unavailable: any input can only return an error. We have been warning about people using the sysctl system call for years and believe there are no more users. Even if there are users of this interface if they have not complained or fixed their code by now they probably are not going to, so there is no point in warning them any longer. So completely remove sys_sysctl on all architectures. [nixiaoming@huawei.com: s390: fix build error for sys_call_table_emu] Link: http://lkml.kernel.org/r/20200618141426.16884-1-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Signed-off-by: Andrew Morton Acked-by: Will Deacon [arm/arm64] Acked-by: "Eric W. Biederman" Cc: Aleksa Sarai Cc: Alexander Shishkin Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bin Meng Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: chenzefeng Cc: Christian Borntraeger Cc: Christian Brauner Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Diego Elio Pettenò Cc: Dmitry Vyukov Cc: Dominik Brodowski Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Iurii Zaikin Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: Jiri Olsa Cc: Kars de Jong Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Luis Chamberlain Cc: Marco Elver Cc: Mark Rutland Cc: Martin K. Petersen Cc: Masahiro Yamada Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miklos Szeredi Cc: Minchan Kim Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Nick Piggin Cc: Oleg Nesterov Cc: Olof Johansson Cc: Paul Burton Cc: "Paul E. 
McKenney" Cc: Paul Mackerras Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sami Tolvanen Cc: Sargun Dhillon Cc: Stephen Rothwell Cc: Sudeep Holla Cc: Sven Schnelle Cc: Thiago Jung Bauermann Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Yoshinori Sato Cc: Zhou Yanjie Link: http://lkml.kernel.org/r/20200616030734.87257-1-nixiaoming@huawei.com Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/sys_ni.c | 1 - kernel/sysctl_binary.c | 171 ------------------------------------------------- 3 files changed, 1 insertion(+), 173 deletions(-) delete mode 100644 kernel/sysctl_binary.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index b3da548691c9..9a20016d4900 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ + sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..4d59775ea79c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -364,7 +364,6 @@ COND_SYSCALL(socketcall); COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... */ -COND_SYSCALL_COMPAT(sysctl); COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c deleted file mode 100644 index 7d550cc76a3b..000000000000 --- a/kernel/sysctl_binary.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<nlen. 
*/ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From 29e44f4535faa71a70827af3639b5e6762d8f02a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 17 Aug 2020 11:07:28 +0100 Subject: watch_queue: Limit the number of watches a user can hold Impose a limit on the number of watches that a user can hold so that they can't use this mechanism to fill up all the available memory. This is done by putting a counter in user_struct that's incremented when a watch is allocated and decreased when it is released. If the number exceeds the RLIMIT_NOFILE limit, the watch is rejected with EAGAIN. This can be tested by the following means: (1) Create a watch queue and attach it to fd 5 in the program given - in this case, bash: keyctl watch_session /tmp/nlog /tmp/gclog 5 bash (2) In the shell, set the maximum number of files to, say, 99: ulimit -n 99 (3) Add 200 keyrings: for ((i=0; i<200; i++)); do keyctl newring a$i @s || break; done (4) Try to watch all of the keyrings: for ((i=0; i<200; i++)); do echo $i; keyctl watch_add 5 %:a$i || break; done This should fail when the number of watches belonging to the user hits 99. (5) Remove all the keyrings and all of those watches should go away: for ((i=0; i<200; i++)); do keyctl unlink %:a$i; done (6) Kill off the watch queue by exiting the shell spawned by watch_session. 
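The accounting scheme described above amounts to a speculative per-user increment that is rolled back when it would push the count past the limit. A minimal sketch of that pattern follows; charge_user_watch()/uncharge_user_watch() are illustrative helper names, and the nr_watches field is added to struct user_struct elsewhere in the same patch, outside the kernel/-only diff shown below.

    /* Sketch: charge one watch against the owning user, bounded by the
     * caller's RLIMIT_NOFILE; roll back the increment on failure.
     */
    static int charge_user_watch(struct user_struct *user)
    {
            if (atomic_inc_return(&user->nr_watches) >
                task_rlimit(current, RLIMIT_NOFILE)) {
                    atomic_dec(&user->nr_watches);  /* undo speculative charge */
                    return -EAGAIN;
            }
            return 0;
    }

    /* Paired decrement on the release path (free_watch() in the diff below). */
    static void uncharge_user_watch(struct user_struct *user)
    {
            atomic_dec(&user->nr_watches);
    }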
Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Linus Torvalds Signed-off-by: David Howells Reviewed-by: Jarkko Sakkinen Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index f74020f6bd9d..0ef8f65bd2d7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu) struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); + atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); } @@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) watch->cred = get_current_cred(); rcu_assign_pointer(watch->watch_list, wlist); + if (atomic_inc_return(&watch->cred->user->nr_watches) > + task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&watch->cred->user->nr_watches); + put_cred(watch->cred); + return -EAGAIN; + } + spin_lock_bh(&wqueue->lock); kref_get(&wqueue->usage); kref_get(&watch->usage); -- cgit v1.2.3 From cf28f3bbfca097d956f9021cb710dfad56adcc62 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 17 Aug 2020 10:42:14 -0700 Subject: bpf: Use get_file_rcu() instead of get_file() for task_file iterator With latest `bpftool prog` command, we observed the following kernel panic. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD dfe894067 P4D dfe894067 PUD deb663067 PMD 0 Oops: 0010 [#1] SMP CPU: 9 PID: 6023 ... RIP: 0010:0x0 Code: Bad RIP value. RSP: 0000:ffffc900002b8f18 EFLAGS: 00010286 RAX: ffff8883a405f400 RBX: ffff888e46a6bf00 RCX: 000000008020000c RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8883a405f400 RBP: ffff888e46a6bf50 R08: 0000000000000000 R09: ffffffff81129600 R10: ffff8883a405f300 R11: 0000160000000000 R12: 0000000000002710 R13: 000000e9494b690c R14: 0000000000000202 R15: 0000000000000009 FS: 00007fd9187fe700(0000) GS:ffff888e46a40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000de5d33002 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rcu_core+0x1a4/0x440 __do_softirq+0xd3/0x2c8 irq_exit+0x9d/0xa0 smp_apic_timer_interrupt+0x68/0x120 apic_timer_interrupt+0xf/0x20 RIP: 0033:0x47ce80 Code: Bad RIP value. RSP: 002b:00007fd9187fba40 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000002 RBX: 00007fd931789160 RCX: 000000000000010c RDX: 00007fd9308cdfb4 RSI: 00007fd9308cdfb4 RDI: 00007ffedd1ea0a8 RBP: 00007fd9187fbab0 R08: 000000000000000e R09: 000000000000002a R10: 0000000000480210 R11: 00007fd9187fc570 R12: 00007fd9316cc400 R13: 0000000000000118 R14: 00007fd9308cdfb4 R15: 00007fd9317a9380 After further analysis, the bug is triggered by Commit eaaacd23910f ("bpf: Add task and task/file iterator targets") which introduced task_file bpf iterator, which traverses all open file descriptors for all tasks in the current namespace. The latest `bpftool prog` calls a task_file bpf program to traverse all files in the system in order to associate processes with progs/maps, etc. When traversing files for a given task, rcu read_lock is taken to access all files in a file_struct. But it used get_file() to grab a file, which is not right. 
It is possible file->f_count is 0 and get_file() will unconditionally increase it. Later put_file() may cause all kind of issues with the above as one of sympotoms. The failure can be reproduced with the following steps in a few seconds: $ cat t.c #include #include #include #include #include #define N 10000 int fd[N]; int main() { int i; for (i = 0; i < N; i++) { fd[i] = open("./note.txt", 'r'); if (fd[i] < 0) { fprintf(stderr, "failed\n"); return -1; } } for (i = 0; i < N; i++) close(fd[i]); return 0; } $ gcc -O2 t.c $ cat run.sh #/bin/bash for i in {1..100} do while true; do ./a.out; done & done $ ./run.sh $ while true; do bpftool prog >& /dev/null; done This patch used get_file_rcu() which only grabs a file if the file->f_count is not zero. This is to ensure the file pointer is always valid. The above reproducer did not fail for more than 30 minutes. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Suggested-by: Josef Bacik Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Josef Bacik Link: https://lore.kernel.org/bpf/20200817174214.252601-1-yhs@fb.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 232df29793e9..f21b5e1e4540 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -178,10 +178,11 @@ again: f = fcheck_files(curr_files, curr_fd); if (!f) continue; + if (!get_file_rcu(f)) + continue; /* set info->fd */ info->fd = curr_fd; - get_file(f); rcu_read_unlock(); return f; } -- cgit v1.2.3 From e679654a704e5bd676ea6446fa7b764cbabf168a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Aug 2020 15:23:09 -0700 Subject: bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator In our production system, we observed rcu stalls when 'bpftool prog` is running. rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ? 
tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0010:task_file_seq_get_next+0x71/0x220 Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38 RSP: 0018:ffffc90006223e10 EFLAGS: 00000297 RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0 RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0 RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118 R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f8815f4f76e Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007 RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010 R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8 R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e Note that `bpftool prog` actually calls a task_file bpf iterator program to establish an association between prog/map/link/btf anon files and processes. In the case where the above rcu stall occured, we had a process having 1587 tasks and each task having roughly 81305 files. This implied 129 million bpf prog invocations. Unfortunwtely none of these files are prog/map/link/btf files so bpf iterator/prog needs to traverse all these files and not able to return to user space since there are no seq_file buffer overflow. This patch fixed the issue in bpf_seq_read() to limit the number of visited objects. If the maximum number of visited objects is reached, no more objects will be visited in the current syscall. If there is nothing written in the seq_file buffer, -EAGAIN will return to the user so user can try again. The maximum number of visited objects is set at 1 million. In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read() visiting 1 million files takes around 0.18 seconds. We did not use cond_resched() since for some iterators, e.g., netlink iterator, where rcu read_lock critical section spans between consecutive seq_ops->next(), which makes impossible to do cond_resched() in the key while loop of function bpf_seq_read(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Paul E. 
McKenney Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..8faa2ce89396 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,9 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +/* maximum visited objects before bailing out */ +#define MAX_ITER_OBJECTS 1000000 + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -79,7 +82,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; - int err = 0; + int err = 0, num_objs = 0; void *p; mutex_lock(&seq->lock); @@ -135,6 +138,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, while (1) { loff_t pos = seq->index; + num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { @@ -153,6 +157,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (seq->count >= size) break; + if (num_objs >= MAX_ITER_OBJECTS) { + if (offs == 0) { + err = -EAGAIN; + seq->op->stop(seq, p); + goto done; + } + break; + } + err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); -- cgit v1.2.3 From e60572b8d4c39572be6857d1ec91fdf979f8775f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Aug 2020 15:23:10 -0700 Subject: bpf: Avoid visit same object multiple times Currently when traversing all tasks, the next tid is always increased by one. This may result in visiting the same task multiple times in a pid namespace. This patch fixed the issue by seting the next tid as pid_nr_ns(pid, ns) + 1, similar to funciton next_tgid(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Rik van Riel Link: https://lore.kernel.org/bpf/20200818222310.2181500-1-yhs@fb.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index f21b5e1e4540..99af4cea1102 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -29,8 +29,9 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns, rcu_read_lock(); retry: - pid = idr_get_next(&ns->idr, tid); + pid = find_ge_pid(*tid, ns); if (pid) { + *tid = pid_nr_ns(pid, ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; -- cgit v1.2.3 From d88d59b64ca35abae208e2781fdb45e69cbed56c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2020 21:44:39 +0200 Subject: core/entry: Respect syscall number rewrites The transcript of the x86 entry code to the generic version failed to reload the syscall number from ptregs after ptrace and seccomp have run, which both can modify the syscall number in ptregs. It returns the original syscall number instead which is obviously not the right thing to do. Reload the syscall number to fix that. 
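A minimal sketch of the ordering the fix restores; do_ptrace_report_sketch() and do_seccomp_sketch() are placeholders for the real hooks, and only syscall_get_nr() is the actual interface used by the patch below.

    /* Both hooks may rewrite the syscall number stored in pt_regs, so the
     * value passed in at entry cannot be trusted afterwards.
     */
    static long syscall_trace_enter_sketch(struct pt_regs *regs, long syscall)
    {
            long ret;

            ret = do_ptrace_report_sketch(regs);    /* may rewrite the number */
            if (ret)
                    return ret;

            ret = do_seccomp_sketch(regs);          /* may rewrite it as well */
            if (ret == -1L)
                    return ret;

            /* Returning 'syscall' here would discard those rewrites. */
            return syscall_get_nr(current, regs);
    }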
Fixes: 142781e108b1 ("entry: Provide generic syscall entry functionality") Reported-by: Kyle Huey Signed-off-by: Thomas Gleixner Tested-by: Kyle Huey Tested-by: Kees Cook Acked-by: Kees Cook Link: https://lore.kernel.org/r/87blj6ifo8.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..fcae019158ca 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -65,7 +65,8 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, syscall_enter_audit(regs, syscall); - return ret ? : syscall; + /* The above might have changed the syscall number */ + return ret ? : syscall_get_nr(current, regs); } noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -- cgit v1.2.3 From 71e843295c680898959b22dc877ae3839cc22470 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 20 Aug 2020 17:42:14 -0700 Subject: kernel/relay.c: fix memleak on destroy relay channel kmemleak report memory leak as follows: unreferenced object 0x607ee4e5f948 (size 8): comm "syz-executor.1", pid 2098, jiffies 4295031601 (age 288.468s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: relay_open kernel/relay.c:583 [inline] relay_open+0xb6/0x970 kernel/relay.c:563 do_blk_trace_setup+0x4a8/0xb20 kernel/trace/blktrace.c:557 __blk_trace_setup+0xb6/0x150 kernel/trace/blktrace.c:597 blk_trace_ioctl+0x146/0x280 kernel/trace/blktrace.c:738 blkdev_ioctl+0xb2/0x6a0 block/ioctl.c:613 block_ioctl+0xe5/0x120 fs/block_dev.c:1871 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x170/0x1ce fs/ioctl.c:739 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 'chan->buf' is malloced in relay_open() by alloc_percpu() but not free while destroy the relay channel. Fix it by adding free_percpu() before return from relay_destroy_channel(). Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Reviewed-by: Chris Wilson Cc: Al Viro Cc: Michael Ellerman Cc: David Rientjes Cc: Michel Lespinasse Cc: Daniel Axtens Cc: Thomas Gleixner Cc: Akash Goel Cc: Link: http://lkml.kernel.org/r/20200817122826.48518-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 72fe443ea78f..fb4e0c530c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -197,6 +197,7 @@ free_buf: static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); + free_percpu(chan->buf); kfree(chan); } -- cgit v1.2.3 From c17c3dc9d08b9aad9a55a1e53f205187972f448e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Aug 2020 17:42:17 -0700 Subject: uprobes: __replace_page() avoid BUG in munlock_vma_page() syzbot crashed on the VM_BUG_ON_PAGE(PageTail) in munlock_vma_page(), when called from uprobes __replace_page(). Which of many ways to fix it? Settled on not calling when PageCompound (since Head and Tail are equals in this context, PageCompound the usual check in uprobes.c, and the prior use of FOLL_SPLIT_PMD will have cleared PageMlocked already). 
Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Reported-by: syzbot Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Srikar Dronamraju Acked-by: Song Liu Acked-by: Oleg Nesterov Cc: "Kirill A. Shutemov" Cc: [5.4+] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008161338360.20413@eggly.anvils Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..0e18aaf23a7b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) munlock_vma_page(old_page); put_page(old_page); -- cgit v1.2.3 From df561f6688fef775baa341a0f5d960becd248b11 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 23 Aug 2020 17:36:59 -0500 Subject: treewide: Use fallthrough pseudo-keyword Replace the existing /* fall through */ comments and its variants with the new pseudo-keyword macro fallthrough[1]. Also, remove unnecessary fall-through markings when it is the case. [1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Gustavo A. R. Silva --- kernel/auditfilter.c | 2 +- kernel/bpf/cgroup.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 4 ++-- kernel/capability.c | 2 +- kernel/compat.c | 6 +++--- kernel/debug/gdbstub.c | 6 +++--- kernel/debug/kdb/kdb_keyboard.c | 4 ++-- kernel/debug/kdb/kdb_support.c | 6 +++--- kernel/events/core.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 4 ++-- kernel/kallsyms.c | 4 ++-- kernel/power/hibernate.c | 2 +- kernel/power/qos.c | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/topology.c | 6 +++--- kernel/signal.c | 2 +- kernel/sys.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-timers.c | 4 ++-- kernel/time/tick-broadcast.c | 2 +- kernel/time/timer.c | 2 +- kernel/trace/blktrace.c | 2 +- kernel/trace/trace_events_filter.c | 4 ++-- 26 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a10e2997aa6c..333b3bcfc545 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -681,7 +681,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fall through - if set */ + fallthrough; /* if set */ default: data->values[i] = f->val; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 83ff127ef7ae..e21de4f1754c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1794,7 +1794,7 @@ static bool cg_sockopt_is_valid_access(int off, int size, return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; case offsetof(struct bpf_sockopt, optname): - /* fallthrough */ + fallthrough; case offsetof(struct bpf_sockopt, level): if (size != size_default) return false; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f1c46529929b..6386b7bb98f2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -279,7 +279,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, break; default: bpf_warn_invalid_xdp_action(act); - /* fallthrough */ + fallthrough; case XDP_DROP: xdp_return_frame(xdpf); 
stats->drop++; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..1bf960aa615c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2029,7 +2029,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; - /* fallthrough */ + fallthrough; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef938f17b944..47e74f09fa37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5236,7 +5236,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, off_reg == dst_reg ? dst : src); return -EACCES; } - /* fall-through */ + fallthrough; default: break; } @@ -10988,7 +10988,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) default: if (!prog_extension) return -EINVAL; - /* fallthrough */ + fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: case BPF_TRACE_FENTRY: diff --git a/kernel/capability.c b/kernel/capability.c index 1444f3954d75..7c59b096c98a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,7 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* fall through - v3 is otherwise equivalent to v2. */ + fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; diff --git a/kernel/compat.c b/kernel/compat.c index b8d2800bb4b7..05adfd6fa8bf 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -255,11 +255,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); - /* fall through */ + fallthrough; case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); - /* fall through */ + fallthrough; case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); - /* fall through */ + fallthrough; case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a790026e42d0..cc3c43dfec44 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1046,14 +1046,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif - /* Fall through */ + fallthrough; case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through - on tmp < 0 */ + fallthrough; /* on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1062,7 +1062,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through - to default processing */ + fallthrough; /* to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 750497b0003a..f877a0a0d7cf 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* fall through */ + fallthrough; case KT_SPEC: if (keychar == K_ENTER) break; - /* fall through */ + fallthrough; default: return -1; /* ignore unprintables */ } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 004c5b6c87f8..6226502ce049 100644 --- 
a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bfe8e3c6e44..7ed5248f0445 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10034,7 +10034,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; - /* fall through */ + fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a8e14c80b405..762a928e18f9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -173,7 +173,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through - to add to randomness */ + fallthrough; /* to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ac5391dcc6..5df903fccb60 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -271,7 +271,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); @@ -868,7 +868,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95cb74f73292..4fb15fa96734 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -684,12 +684,12 @@ bool kallsyms_show_value(const struct cred *cred) case 0: if (kallsyms_for_perf()) return true; - /* fallthrough */ + fallthrough; case 1: if (security_capable(cred, &init_user_ns, CAP_SYSLOG, CAP_OPT_NOAUDIT) == 0) return true; - /* fallthrough */ + fallthrough; default: return false; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f33769f97aca..e7aa57fb2fdc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -659,7 +659,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); - /* Fall through */ + fallthrough; case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index db0bed2cae26..ec7e1e85923e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -119,7 +119,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * and add, then see if the aggregate has changed. 
*/ plist_del(node, &c->list); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -188,7 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8471a0f7eb32..2d95dc3f4644 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2320,7 +2320,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* Fall-through */ + fallthrough; case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..1bd7e3af904f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1219,13 +1219,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) free_rootdomain(&d->rd->rcu); - /* Fall through */ + fallthrough; case sa_sd: free_percpu(d->sd); - /* Fall through */ + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); - /* Fall through */ + fallthrough; case sa_none: break; } diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..a38b3edc6851 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -851,7 +851,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; - /* fall through */ + fallthrough; default: return -EPERM; } diff --git a/kernel/sys.c b/kernel/sys.c index ca11af9d815d..ab6c409b1159 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1753,7 +1753,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; - /* fall through */ + fallthrough; case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c4038511d5c9..95b6a708b040 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -377,7 +377,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 07709ac30439..bf540f5a4115 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -439,12 +439,12 @@ static struct pid *good_sigevent(sigevent_t * event) rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_NONE: return pid; default: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e51778c312f1..36d7464c8962 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -381,7 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; - /* fall through */ + fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a16764b0116e..a50364df1054 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ 
-666,7 +666,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ba62d68885a..4b3a42fc3b24 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -745,7 +745,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; - /* fall through */ + fallthrough; case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index bf44f6bbd0c3..78a678eeb140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -499,7 +499,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, ptr++; break; } - /* fall through */ + fallthrough; default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -1273,7 +1273,7 @@ static int parse_pred(const char *str, void *data, switch (op) { case OP_NE: pred->not = 1; - /* Fall through */ + fallthrough; case OP_GLOB: case OP_EQ: break; -- cgit v1.2.3 From b474959d5afda6e341a02c85f9595d85d39189ae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 12:10:54 -0700 Subject: bpf: Fix a buffer out-of-bound access when filling raw_tp link_info Commit f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") added link query for raw_tp. One of fields in link_info is to fill a user buffer with tp_name. The Scurrent checking only declares "ulen && !ubuf" as invalid. So "!ulen && ubuf" will be valid. Later on, we do "copy_to_user(ubuf, tp_name, ulen - 1)" which may overwrite user memory incorrectly. This patch fixed the problem by disallowing "!ulen && ubuf" case as well. Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200821191054.714731-1-yhs@fb.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..ac6c784c0576 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2634,7 +2634,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); - if (ulen && !ubuf) + if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; -- cgit v1.2.3 From 7787b6fc938e16aa418613c4a765c1dbb268ed9f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 24 Aug 2020 16:20:47 +0200 Subject: bpf, sysctl: Let bpf_stats_handler take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of bpf_stats_handler to match ctl_table.proc_handler which fixes the following sparse warning: kernel/sysctl.c:226:49: warning: incorrect type in argument 3 (different address spaces) kernel/sysctl.c:226:49: expected void * kernel/sysctl.c:226:49: got void [noderef] __user *buffer kernel/sysctl.c:2640:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/sysctl.c:2640:35: expected int ( [usertype] *proc_handler )( ... ) kernel/sysctl.c:2640:35: got int ( * )( ... 
) Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200824142047.22043-1-tklauser@distanz.ch --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 287862f91717..09e70ee2332e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; -- cgit v1.2.3 From fddf9055a60dfcc97bda5ef03c8fa4108ed555c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Aug 2020 09:13:30 +0200 Subject: lockdep: Use raw_cpu_*() for per-cpu variables Sven reported that commit a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") caused trouble on s390 because their this_cpu_*() primitives disable preemption which then lands back tracing. On the one hand, per-cpu ops should use preempt_*able_notrace() and raw_local_irq_*(), on the other hand, we can trivialy use raw_cpu_*() ops for this. Fixes: a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") Reported-by: Sven Schnelle Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821085348.192346882@infradead.org --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2fad21d345b0..c872e95e6e4d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3756,7 +3756,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - this_cpu_write(hardirqs_enabled, 1); + __this_cpu_write(hardirqs_enabled, 1); trace->hardirq_enable_ip = ip; trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3795,7 +3795,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) /* * We have done an ON -> OFF transition: */ - this_cpu_write(hardirqs_enabled, 0); + __this_cpu_write(hardirqs_enabled, 0); trace->hardirq_disable_ip = ip; trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); -- cgit v1.2.3 From 1098582a0f6c4e8fd28da0a6305f9233d02c9c1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2020 20:50:19 +0200 Subject: sched,idle,rcu: Push rcu_idle deeper into the idle path Lots of things take locks, due to a wee bug, rcu_lockdep didn't notice that the locking tracepoints were using RCU. Push rcu_idle_{enter,exit}() as deep as possible into the idle paths, this also resolves a lot of _rcuidle()/RCU_NONIDLE() usage. Specifically, sched_clock_idle_wakeup_event() will use ktime which will use seqlocks which will tickle lockdep, and stop_critical_timings() uses lock. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. 
Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.310943801@infradead.org --- kernel/sched/idle.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6bf34986f45c..ea3a0989dc4c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -54,17 +54,18 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + trace_cpu_idle(0, smp_processor_id()); + stop_critical_timings(); rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - stop_critical_timings(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); - start_critical_timings(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); return 1; } @@ -91,7 +92,9 @@ void __cpuidle default_idle_call(void) local_irq_enable(); } else { stop_critical_timings(); + rcu_idle_enter(); arch_cpu_idle(); + rcu_idle_exit(); start_critical_timings(); } } @@ -158,7 +161,6 @@ static void cpuidle_idle_call(void) if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); - rcu_idle_enter(); default_idle_call(); goto exit_idle; @@ -178,21 +180,17 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { - rcu_idle_enter(); entered_state = call_cpuidle_s2idle(drv, dev); if (entered_state > 0) goto exit_idle; - rcu_idle_exit(); - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); - rcu_idle_enter(); next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); @@ -209,8 +207,6 @@ static void cpuidle_idle_call(void) else tick_nohz_idle_retain_tick(); - rcu_idle_enter(); - entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -226,8 +222,6 @@ exit_idle: */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); - - rcu_idle_exit(); } /* -- cgit v1.2.3 From 9864f5b5943ab0f1f835f21dc3f9f068d06f5b52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Aug 2020 12:27:10 +0200 Subject: cpuidle: Move trace_cpu_idle() into generic code Remove trace_cpu_idle() from the arch_cpu_idle() implementations and put it in the generic code, right before disabling RCU. Gets rid of more trace_*_rcuidle() users. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. 
Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.428433395@infradead.org --- kernel/sched/idle.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ea3a0989dc4c..f324dc36fc43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -91,11 +91,14 @@ void __cpuidle default_idle_call(void) if (current_clr_polling_and_test()) { local_irq_enable(); } else { + + trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); rcu_idle_exit(); start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } } -- cgit v1.2.3 From eb1f00237aca2e368b93db79303f8433d1976d10 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2020 20:53:16 +0200 Subject: lockdep,trace: Expose tracepoints The lockdep tracepoints are under the lockdep recursion counter, this has a bunch of nasty side effects: - TRACE_IRQFLAGS doesn't work across the entire tracepoint - RCU-lockdep doesn't see the tracepoints either, hiding numerous "suspicious RCU usage" warnings. Pull the trace_lock_*() tracepoints completely out from under the lockdep recursion handling and completely rely on the trace level recusion handling -- also, tracing *SHOULD* not be taking locks in any case. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.782688941@infradead.org --- kernel/locking/lockdep.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c872e95e6e4d..54b74fabf40c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4977,6 +4977,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + if (unlikely(current->lockdep_recursion)) { /* XXX allow trylock from NMI ?!? 
*/ if (lockdep_nmi() && !trylock) { @@ -5001,7 +5003,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, check_flags(flags); current->lockdep_recursion++; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5013,13 +5014,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_release(lock, ip); + if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); + current->lockdep_recursion++; - trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5205,8 +5208,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip); - stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -5225,6 +5226,8 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_acquired(lock, ip); + if (unlikely(!lock_stat || !debug_locks)) return; @@ -5234,7 +5237,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion++; - trace_lock_contended(lock, ip); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5245,6 +5247,8 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_contended(lock, ip); + if (unlikely(!lock_stat || !debug_locks)) return; -- cgit v1.2.3 From 892fc9f6835ecf075efac20789b012c5c9997fcc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Aug 2020 14:33:30 +0300 Subject: dma-pool: Fix an uninitialized variable bug in atomic_pool_expand() The "page" pointer can be used with out being initialized. Fixes: d7e673ec2c8e ("dma-pool: Only allocate from CMA when in same memory zone") Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 06582b488e31..1281c0f0442b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,7 +84,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { unsigned int order; - struct page *page; + struct page *page = NULL; void *addr; int ret = -ENOMEM; -- cgit v1.2.3 From 784a0830377d0761834e385975bc46861fea9fa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 30 Aug 2020 19:07:53 +0200 Subject: genirq/matrix: Deal with the sillyness of for_each_cpu() on UP Most of the CPU mask operations behave the same way, but for_each_cpu() and it's variants ignore the cpumask argument and claim that CPU0 is always in the mask. This is historical, inconsistent and annoying behaviour. The matrix allocator uses for_each_cpu() and can be called on UP with an empty cpumask. The calling code does not expect that this succeeds but until commit e027fffff799 ("x86/irq: Unbreak interrupt affinity setting") this went unnoticed. That commit added a WARN_ON() to catch cases which move an interrupt from one vector to another on the same CPU. The warning triggers on UP. Add a check for the cpumask being empty to prevent this. 
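The UP behaviour being worked around is roughly the following: with NR_CPUS == 1, for_each_cpu() only evaluates its mask argument for side effects, so even an empty mask yields one iteration with cpu == 0. The definition below is paraphrased from include/linux/cpumask.h and may not match the exact source text.

    /* UP (NR_CPUS == 1) variant, roughly: the mask is ignored. */
    #define for_each_cpu(cpu, mask)                         \
            for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))

    /* Consequence: an empty mask still "contains" CPU0 on UP, so
     * matrix_find_best_cpu() would pick a CPU from a mask that has none,
     * which is why irq_matrix_alloc() now rejects an empty mask up front.
     */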
Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/matrix.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 30cc217b8631..651a4ad6d711 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -380,6 +380,13 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, unsigned int cpu, bit; struct cpumap *cm; + /* + * Not required in theory, but matrix_find_best_cpu() uses + * for_each_cpu() which ignores the cpumask on UP . + */ + if (cpumask_empty(msk)) + return -EINVAL; + cpu = matrix_find_best_cpu(m, msk); if (cpu == UINT_MAX) return -ENOSPC; -- cgit v1.2.3 From 23870f1227680d2aacff6f79c3ab2222bd04e86e Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Wed, 2 Sep 2020 18:03:23 +0200 Subject: locking/lockdep: Fix "USED" <- "IN-NMI" inversions During the LPC RCU BoF Paul asked how come the "USED" <- "IN-NMI" detector doesn't trip over rcu_read_lock()'s lockdep annotation. Looking into this I found a very embarrasing typo in verify_lock_unused(): - if (!(class->usage_mask & LOCK_USED)) + if (!(class->usage_mask & LOCKF_USED)) fixing that will indeed cause rcu_read_lock() to insta-splat :/ The above typo means that instead of testing for: 0x100 (1 << LOCK_USED), we test for 8 (LOCK_USED), which corresponds to (1 << LOCK_ENABLED_HARDIRQ). So instead of testing for _any_ used lock, it will only match any lock used with interrupts enabled. The rcu_read_lock() annotation uses .check=0, which means it will not set any of the interrupt bits and will thus never match. In order to properly fix the situation and allow rcu_read_lock() to correctly work, split LOCK_USED into LOCK_USED and LOCK_USED_READ and by having .read users set USED_READ and test USED, pure read-recursive locks are permitted. Fixes: f6f48e180404 ("lockdep: Teach lockdep about "USED" <- "IN-NMI" inversions") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Masami Hiramatsu Acked-by: Paul E. 
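The typo is easier to see with the numbers from the description above spelled out. The snippet below is only a schematic reconstruction of the relevant definitions, not the real lockdep headers.

    enum { LOCK_ENABLED_HARDIRQ = 3, /* ... */ LOCK_USED = 8 };
    #define LOCKF_USED      (1 << LOCK_USED)        /* 0x100 */

    static int class_ever_used(unsigned int usage_mask)
    {
            /* Buggy: tests value 8 == (1 << LOCK_ENABLED_HARDIRQ), i.e. the
             * "enabled hardirq" bit, not the "ever used" bit.
             */
            /* return usage_mask & LOCK_USED; */

            /* Intended: test bit 8, the "this class was ever used" bit. */
            return usage_mask & LOCKF_USED;
    }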
McKenney Link: https://lore.kernel.org/r/20200902160323.GK1362448@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 35 +++++++++++++++++++++++++++++------ kernel/locking/lockdep_internals.h | 2 ++ 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 54b74fabf40c..2facbbd146ec 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3969,13 +3969,18 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int old_mask, new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); return 0; } + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3988,13 +3993,22 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; + old_mask = hlock_class(this)->usage_mask; hlock_class(this)->usage_mask |= new_mask; + /* + * Save one usage_traces[] entry and map both LOCK_USED and + * LOCK_USED_READ onto the same entry. + */ + if (new_bit == LOCK_USED || new_bit == LOCK_USED_READ) { + if (old_mask & (LOCKF_USED | LOCKF_USED_READ)) + goto unlock; + new_bit = LOCK_USED; + } + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; @@ -4008,6 +4022,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; } +unlock: graph_unlock(); /* @@ -4942,12 +4957,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock { #ifdef CONFIG_PROVE_LOCKING struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; /* if it doesn't have a class (yet), it certainly hasn't been used yet */ if (!class) return; - if (!(class->usage_mask & LOCK_USED)) + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) return; hlock->class_idx = class - lock_classes; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index baca699b94e9..b0be1560ed17 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -19,6 +19,7 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, + LOCK_USED_READ, LOCK_USAGE_STATES }; @@ -40,6 +41,7 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -- cgit v1.2.3 From dc0988bbe1bd41e2fa555e4a6f890b819a34b49b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 2 Sep 2020 16:53:40 -0700 Subject: bpf: Do not use bucket_lock for hashmap iterator Currently, for hashmap, the bpf iterator will grab a bucket lock, a spinlock, before traversing the elements in the bucket. This can ensure all bpf visted elements are valid. But this mechanism may cause deadlock if update/deletion happens to the same bucket of the visited map in the program. 
For example, if we added a bpf_map_update_elem() call to the same visited element in selftests bpf_iter_bpf_hash_map.c, we will have the following deadlock: ============================================ WARNING: possible recursive locking detected 5.9.0-rc1+ #841 Not tainted -------------------------------------------- test_progs/1750 is trying to acquire lock: ffff9a5bb73c5e70 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x1cf/0x410 but task is already holding lock: ffff9a5bb73c5e20 (&htab->buckets[i].raw_lock){....}-{2:2}, at: bpf_hash_map_seq_find_next+0x94/0x120 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&htab->buckets[i].raw_lock); lock(&htab->buckets[i].raw_lock); *** DEADLOCK *** ... Call Trace: dump_stack+0x78/0xa0 __lock_acquire.cold.74+0x209/0x2e3 lock_acquire+0xba/0x380 ? htab_map_update_elem+0x1cf/0x410 ? __lock_acquire+0x639/0x20c0 _raw_spin_lock_irqsave+0x3b/0x80 ? htab_map_update_elem+0x1cf/0x410 htab_map_update_elem+0x1cf/0x410 ? lock_acquire+0xba/0x380 bpf_prog_ad6dab10433b135d_dump_bpf_hash_map+0x88/0xa9c ? find_held_lock+0x34/0xa0 bpf_iter_run_prog+0x81/0x16e __bpf_hash_map_seq_show+0x145/0x180 bpf_seq_read+0xff/0x3d0 vfs_read+0xad/0x1c0 ksys_read+0x5f/0xe0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ... The bucket_lock is first grabbed in seq_ops->next(), called by bpf_seq_read(), and then grabbed again in htab_map_update_elem() in the bpf program, causing the deadlock. Actually, we do not need the bucket_lock here; we can just use rcu_read_lock(), similar to the netlink iterator, where the rcu_read_{lock,unlock} usage looks like below: seq_ops->start(): rcu_read_lock(); seq_ops->next(): rcu_read_unlock(); /* next element */ rcu_read_lock(); seq_ops->stop(): rcu_read_unlock(); Compared to the old bucket_lock mechanism, if a concurrent update/delete happens, we may visit stale elements, miss some elements, or repeat some elements. I think this is a reasonable compromise. For users wanting to avoid stale, missing/repeated accesses, the bpf_map batch access syscall interface can be used.
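As a hedged sketch of that seq_ops pattern (hypothetical iterator callbacks and helpers, not the actual hashtab code):

	static void *iter_start(struct seq_file *m, loff_t *pos)
	{
		rcu_read_lock();			/* protects the element handed to show() */
		return first_elem(m, *pos);		/* hypothetical helper */
	}

	static void *iter_next(struct seq_file *m, void *v, loff_t *pos)
	{
		rcu_read_unlock();			/* previous element may now go stale */
		rcu_read_lock();			/* re-enter the read side for the next one */
		return next_elem(m, v, pos);		/* hypothetical helper */
	}

	static void iter_stop(struct seq_file *m, void *v)
	{
		rcu_read_unlock();
	}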
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200902235340.2001375-1-yhs@fb.com --- kernel/bpf/hashtab.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 78dfff6a501b..7df28a45c66b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash - unsigned long flags; u32 bucket_id; u32 skip_elems; }; @@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; - unsigned long flags = info->flags; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; @@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; - flags = htab_lock_bucket(htab, b); + rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { - info->flags = flags; info->bucket_id = i; info->skip_elems = count; return elem; @@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, count++; } - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } @@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { - struct bpf_iter_seq_hash_map_info *info = seq->private; - if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else - htab_unlock_bucket(info->htab, - &info->htab->buckets[info->bucket_id], - info->flags); + rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, -- cgit v1.2.3 From 1b0df11fde0f14a269a181b3b7f5122415bc5ed7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 2 Sep 2020 13:07:56 -0400 Subject: padata: fix possible padata_works_lock deadlock syzbot reports, WARNING: inconsistent lock state 5.9.0-rc2-syzkaller #0 Not tainted -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. syz-executor.0/26715 takes: (padata_works_lock){+.?.}-{2:2}, at: padata_do_parallel kernel/padata.c:220 {IN-SOFTIRQ-W} state was registered at: spin_lock include/linux/spinlock.h:354 [inline] padata_do_parallel kernel/padata.c:220 ... __do_softirq kernel/softirq.c:298 ... sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1091 asm_sysvec_apic_timer_interrupt arch/x86/include/asm/idtentry.h:581 Possible unsafe locking scenario: CPU0 ---- lock(padata_works_lock); lock(padata_works_lock); padata_do_parallel() takes padata_works_lock with softirqs enabled, so a deadlock is possible if, on the same CPU, the lock is acquired in process context and then softirq handling done in an interrupt leads to the same path. Fix by leaving softirqs disabled while do_parallel holds padata_works_lock. 
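The resulting ordering can be sketched as follows (illustrative only; the real padata_do_parallel() hunk is below):

	rcu_read_lock_bh();			/* also disables softirqs */
	spin_lock(&padata_works_lock);		/* cannot be re-taken from softirq on this CPU now */
	/* ... allocate the parallel work item ... */
	spin_unlock(&padata_works_lock);
	rcu_read_unlock_bh();			/* softirqs run again only after the lock is dropped */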
Reported-by: syzbot+f4b9f49e38e25eb4ef52@syzkaller.appspotmail.com Fixes: 4611ce2246889 ("padata: allocate work structures for parallel jobs from a pool") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 16cb894dc272..d4d3ba6e1728 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps, padata->pd = pd; padata->cb_cpu = *cb_cpu; - rcu_read_unlock_bh(); - spin_lock(&padata_works_lock); padata->seq_nr = ++pd->seq_nr; pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + + rcu_read_unlock_bh(); + if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); -- cgit v1.2.3 From 4facb95b7adaf77e2da73aafb9ba60996fe42a12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Sep 2020 01:50:54 +0200 Subject: x86/entry: Unbreak 32bit fast syscall Andy reported that syscall tracing for 32bit fast syscalls fails: # ./tools/testing/selftests/x86/ptrace_syscall_32 ... [RUN] SYSEMU [FAIL] Initial args are wrong (nr=224, args=10 11 12 13 14 4289172732) ... [RUN] SYSCALL [FAIL] Initial args are wrong (nr=29, args=0 0 0 0 0 4289172732) The reason is that the conversion to generic entry code moved the retrieval of the sixth argument (EBP) after the point where the syscall entry work runs, i.e. ptrace, seccomp, audit... Unbreak it by providing a split-up version of syscall_enter_from_user_mode(). - syscall_enter_from_user_mode_prepare() establishes state and enables interrupts - syscall_enter_from_user_mode_work() runs the entry work Replace the call to syscall_enter_from_user_mode() in the 32bit fast syscall C-entry with the split functions and stick the EBP retrieval between them. Fixes: 27d6b4d14f5c ("x86/entry: Use generic syscall entry function") Reported-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87k0xdjbtt.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fcae019158ca..18683598edbc 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -69,22 +69,45 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ?
: syscall_get_nr(current, regs); } -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +static __always_inline long +__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long ti_work; - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); ti_work = READ_ONCE(current_thread_info()->flags); if (ti_work & SYSCALL_ENTER_WORK) syscall = syscall_trace_enter(regs, syscall, ti_work); - instrumentation_end(); return syscall; } +long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + return __syscall_enter_from_user_work(regs, syscall); +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = __syscall_enter_from_user_work(regs, syscall); + instrumentation_end(); + + return ret; +} + +noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + instrumentation_begin(); + local_irq_enable(); + instrumentation_end(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * -- cgit v1.2.3 From cfc905f158eaa099d6258031614d11869e7ef71c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 4 Sep 2020 18:58:08 +0300 Subject: gcov: Disable gcov build with GCC 10 GCOV built with GCC 10 doesn't initialize n_function variable. This produces different kernel panics as was seen by Colin in Ubuntu and me in FC 32. As a workaround, let's disable GCOV build for broken GCC 10 version. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1891288 Link: https://lore.kernel.org/lkml/20200827133932.3338519-1-leon@kernel.org Link: https://lore.kernel.org/lkml/CAHk-=whbijeSdSvx-Xcr0DPMj0BiwhJ+uiNnDSVZcr_h_kg7UA@mail.gmail.com/ Cc: Colin Ian King Signed-off-by: Leon Romanovsky Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3110c77230c7..bb4b680e8455 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS + depends on !CC_IS_GCC || GCC_VERSION < 100000 select CONSTRUCTORS if !UML default n help -- cgit v1.2.3 From b0daa2c73f3a8ab9183a9d298495b583b6c1bd19 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 4 Sep 2020 16:35:49 -0700 Subject: fork: adjust sysctl_max_threads definition to match prototype Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the definition of sysctl_max_threads to match its prototype in linux/sysctl.h which fixes the following sparse error/warning: kernel/fork.c:3050:47: warning: incorrect type in argument 3 (different address spaces) kernel/fork.c:3050:47: expected void * kernel/fork.c:3050:47: got void [noderef] __user *buffer kernel/fork.c:3036:5: error: symbol 'sysctl_max_threads' redeclared with different type (incompatible argument 3 (different address spaces)): kernel/fork.c:3036:5: int extern [addressable] [signed] [toplevel] sysctl_max_threads( ... 
) kernel/fork.c: note: in included file (through include/linux/key.h, include/linux/cred.h, include/linux/sched/signal.h, include/linux/sched/cputime.h): include/linux/sysctl.h:242:5: note: previously declared as: include/linux/sysctl.h:242:5: int extern [addressable] [signed] [toplevel] sysctl_max_threads( ... ) Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200825093647.24263-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190861bd..49677d668de4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3014,7 +3014,7 @@ int unshare_files(struct files_struct **displaced) } int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; -- cgit v1.2.3 From a566a9012acd7c9a4be7e30dc7acb7a811ec2260 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Tue, 1 Sep 2020 19:40:16 -0600 Subject: seccomp: don't leak memory when filter install races In seccomp_set_mode_filter() with TSYNC | NEW_LISTENER, we first initialize the listener fd, then check to see if we can actually use it later in seccomp_may_assign_mode(), which can fail if anyone else in our thread group has installed a filter and caused some divergence. If we can't, we partially clean up the newly allocated file: we put the fd, put the file, but don't actually clean up the *memory* that was allocated at filter->notif. Let's clean that up too. To accomplish this, let's hoist the actual "detach a notifier from a filter" code to its own helper out of seccomp_notify_release(), so that in case anyone adds stuff to init_listener(), they only have to add the cleanup code in one spot. This does a bit of extra locking and such on the failure path when the filter is not attached, but it's a slow failure path anyway. 
Fixes: 51891498f2da ("seccomp: allow TSYNC and USER_NOTIF together") Reported-by: syzbot+3ad9614a12f80994c32e@syzkaller.appspotmail.com Signed-off-by: Tycho Andersen Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200902014017.934315-1-tycho@tycho.pizza Signed-off-by: Kees Cook --- kernel/seccomp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3ee59ce0a323..bb0dd9ae699a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1109,13 +1109,12 @@ out: } #ifdef CONFIG_SECCOMP_FILTER -static int seccomp_notify_release(struct inode *inode, struct file *file) +static void seccomp_notify_detach(struct seccomp_filter *filter) { - struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; if (!filter) - return 0; + return; mutex_lock(&filter->notify_lock); @@ -1142,6 +1141,13 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) kfree(filter->notif); filter->notif = NULL; mutex_unlock(&filter->notify_lock); +} + +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + + seccomp_notify_detach(filter); __put_seccomp_filter(filter); return 0; } @@ -1581,6 +1587,7 @@ out_put_fd: listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); + seccomp_notify_detach(prepared); } else { fd_install(listener, listener_f); ret = listener; -- cgit v1.2.3 From e839317900e9f13c83d8711d684de88c625b307a Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 2 Sep 2020 08:09:53 -0600 Subject: seccomp: don't leave dangling ->notif if file allocation fails Christian and Kees both pointed out that this is a bit sloppy to open-code both places, and Christian points out that we leave a dangling pointer to ->notif if file allocation fails. Since we check ->notif for null in order to determine if it's ok to install a filter, this means people won't be able to install a filter if the file allocation fails for some reason, even if they subsequently should be able to. To fix this, let's hoist this free+null into its own little helper and use it. 
Reported-by: Kees Cook Reported-by: Christian Brauner Signed-off-by: Tycho Andersen Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200902140953.1201956-1-tycho@tycho.pizza Signed-off-by: Kees Cook --- kernel/seccomp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bb0dd9ae699a..676d4af62103 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1109,6 +1109,12 @@ out: } #ifdef CONFIG_SECCOMP_FILTER +static void seccomp_notify_free(struct seccomp_filter *filter) +{ + kfree(filter->notif); + filter->notif = NULL; +} + static void seccomp_notify_detach(struct seccomp_filter *filter) { struct seccomp_knotif *knotif; @@ -1138,8 +1144,7 @@ static void seccomp_notify_detach(struct seccomp_filter *filter) complete(&knotif->ready); } - kfree(filter->notif); - filter->notif = NULL; + seccomp_notify_free(filter); mutex_unlock(&filter->notify_lock); } @@ -1494,7 +1499,7 @@ static struct file *init_listener(struct seccomp_filter *filter) out_notif: if (IS_ERR(ret)) - kfree(filter->notif); + seccomp_notify_free(filter); out: return ret; } -- cgit v1.2.3 From 40249c6962075c040fd071339acae524f18bfac9 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 10 Sep 2020 14:52:01 +0200 Subject: gcov: add support for GCC 10.1 Using gcov to collect coverage data for kernels compiled with GCC 10.1 causes random malfunctions and kernel crashes. This is the result of a changed GCOV_COUNTERS value in GCC 10.1 that causes a mismatch between the layout of the gcov_info structure created by GCC profiling code and the related structure used by the kernel. Fix this by updating the in-kernel GCOV_COUNTERS value. Also re-enable config GCOV_KERNEL for use with GCC 10. Reported-by: Colin Ian King Reported-by: Leon Romanovsky Signed-off-by: Peter Oberparleiter Tested-by: Leon Romanovsky Tested-and-Acked-by: Colin Ian King Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 1 - kernel/gcov/gcc_4_7.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index bb4b680e8455..3110c77230c7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_GCC || GCC_VERSION < 100000 select CONSTRUCTORS if !UML default n help diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 908fdf5098c3..53c67c87f141 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -19,7 +19,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 7) +#if (__GNUC__ >= 10) +#define GCOV_COUNTERS 8 +#elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 -- cgit v1.2.3 From 73ac74c7d489756d2313219a108809921dbfaea1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 10 Sep 2020 12:24:53 +0200 Subject: lockdep: fix order in trace_hardirqs_off_caller() Switch order so that locking state is consistent even if the IRQ tracer calls into lockdep again. 
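Conceptually the new ordering is (sketch only; the real trace_hardirqs_off_caller() hunk follows):

	/* Update lockdep's view of the IRQ state first ... */
	lockdep_hardirqs_off(CALLER_ADDR0);
	/* ... so tracer code that may call back into lockdep sees consistent state. */
	tracer_hardirqs_off(CALLER_ADDR0, caller_addr);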
Acked-by: Peter Zijlstra Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f10073e62603..f4938040c228 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -102,14 +102,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); NOKPROBE_SYMBOL(trace_hardirqs_off_caller); -- cgit v1.2.3 From b6ec413461034d49f9e586845825adb35ba308f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Sep 2020 17:58:26 -0700 Subject: core/entry: Report syscall correctly for trace and audit On v5.8 when doing seccomp syscall rewrites (e.g. getpid into getppid as seen in the seccomp selftests), trace (and audit) correctly see the rewritten syscall on entry and exit: seccomp_bpf-1307 [000] .... 22974.874393: sys_enter: NR 110 (... seccomp_bpf-1307 [000] .N.. 22974.874401: sys_exit: NR 110 = 1304 With mainline we see a mismatched enter and exit (the original syscall is incorrectly visible on entry): seccomp_bpf-1030 [000] .... 21.806766: sys_enter: NR 39 (... seccomp_bpf-1030 [000] .... 21.806767: sys_exit: NR 110 = 1027 When ptrace or seccomp change the syscall, this needs to be visible to trace and audit at that time as well. Update the syscall earlier so they see the correct value. Fixes: d88d59b64ca3 ("core/entry: Respect syscall number rewrites") Reported-by: Michael Ellerman Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200912005826.586171-1-keescook@chromium.org --- kernel/entry/common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 18683598edbc..6fdb6105e6d6 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -60,13 +60,15 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret; } + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); - /* The above might have changed the syscall number */ - return ret ? : syscall_get_nr(current, regs); + return ret ? : syscall; } static __always_inline long -- cgit v1.2.3 From ce880cb825fcc22d4e39046a6c3a3a7f6603883d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 15 Sep 2020 17:44:01 -0700 Subject: bpf: Fix a rcu warning for bpffs map pretty-print Running selftest ./btf_btf -p the kernel had the following warning: [ 51.528185] WARNING: CPU: 3 PID: 1756 at kernel/bpf/hashtab.c:717 htab_map_get_next_key+0x2eb/0x300 [ 51.529217] Modules linked in: [ 51.529583] CPU: 3 PID: 1756 Comm: test_btf Not tainted 5.9.0-rc1+ #878 [ 51.530346] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 51.531410] RIP: 0010:htab_map_get_next_key+0x2eb/0x300 ... 
[ 51.542826] Call Trace: [ 51.543119] map_seq_next+0x53/0x80 [ 51.543528] seq_read+0x263/0x400 [ 51.543932] vfs_read+0xad/0x1c0 [ 51.544311] ksys_read+0x5f/0xe0 [ 51.544689] do_syscall_64+0x33/0x40 [ 51.545116] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The related source code in kernel/bpf/hashtab.c: 709 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 710 { 711 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 712 struct hlist_nulls_head *head; 713 struct htab_elem *l, *next_l; 714 u32 hash, key_size; 715 int i = 0; 716 717 WARN_ON_ONCE(!rcu_read_lock_held()); In kernel/bpf/inode.c, bpffs map pretty print calls map->ops->map_get_next_key() without holding a rcu_read_lock(), hence causing the above warning. To fix the issue, just surrounding map->ops->map_get_next_key() with rcu read lock. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200916004401.146277-1-yhs@fb.com --- kernel/bpf/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fb878ba3f22f..18f4969552ac 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -226,10 +226,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key; + rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; } -- cgit v1.2.3 From e6b1a44eccfcab5e5e280be376f65478c3b2c7a2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 15 Sep 2020 22:07:50 +0800 Subject: locking/percpu-rwsem: Use this_cpu_{inc,dec}() for read_count The __this_cpu*() accessors are (in general) IRQ-unsafe which, given that percpu-rwsem is a blocking primitive, should be just fine. However, file_end_write() is used from IRQ context and will cause load-store issues on architectures where the per-cpu accessors are not natively irq-safe. Fix it by using the IRQ-safe this_cpu_*() for operations on read_count. This will generate more expensive code on a number of platforms, which might cause a performance regression for some of the other percpu-rwsem users. If any such is reported, we can consider alternative solutions. 
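A minimal illustration of the difference (assumed variable and function names, not the rwsem code itself):

	static DEFINE_PER_CPU(unsigned int, reader_count);

	static void reader_enter(void)
	{
		/*
		 * this_cpu_inc() is IRQ-safe; __this_cpu_inc() may be a plain
		 * load/modify/store on some architectures and can be corrupted
		 * if an interrupt handler updates the same counter.
		 */
		this_cpu_inc(reader_count);
	}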
Fixes: 70fe2f48152e ("aio: fix freeze protection of aio writes") Signed-off-by: Hou Tao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Acked-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20200915140750.137881-1-houtao1@huawei.com --- kernel/locking/percpu-rwsem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..70a32a576f3f 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); -- cgit v1.2.3 From 78edc005f477a4987ee0a5d96bfe117295c231fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Aug 2020 08:09:40 -0700 Subject: rcu-tasks: Prevent complaints of unused show_rcu_tasks_classic_gp_kthread() Commit 8344496e8b49 ("rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads()") introduced conditional compilation of several functions, but forgot one occurrence of show_rcu_tasks_classic_gp_kthread() that causes the compiler to warn of an unused static function. This commit uses "static inline" to avoid these complaints and possibly also to avoid emitting an actual definition of this function. Fixes: 8344496e8b49 ("rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads()") Cc: # 5.8.x Reported-by: Laurent Pinchart Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 835e2df8590a..05d3e1375e4c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -590,7 +590,7 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static void show_rcu_tasks_classic_gp_kthread(void) { } +static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ -- cgit v1.2.3 From 5ef64cc8987a9211d3f3667331ba3411a94ddc79 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Sep 2020 14:05:35 -0700 Subject: mm: allow a controlled amount of unfairness in the page lock Commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") made the page locking entirely fair, in that if a waiter came in while the lock was held, the lock would be transferred to the lockers strictly in order. That was intended to finally get rid of the long-reported watchdog failures that involved the page lock under extreme load, where a process could end up waiting essentially forever, as other page lockers stole the lock from under it. It also improved some benchmarks, but it ended up causing huge performance regressions on others, simply because fair lock behavior doesn't end up giving out the lock as aggressively, causing better worst-case latency, but potentially much worse average latencies and throughput. 
Instead of reverting that change entirely, this introduces a controlled amount of unfairness, with a sysctl knob to tune it if somebody needs to. But the default value should hopefully be good for any normal load, allowing a few rounds of lock stealing, but enforcing the strict ordering before the lock has been stolen too many times. There is also a hint from Matthieu Baerts that the fair page coloring may end up exposing an ABBA deadlock that is hidden by the usual optimistic lock stealing, and while the unfairness doesn't fix the fundamental issue (and I'm still looking at that), it avoids it in practice. The amount of unfairness can be modified by writing a new value to the 'sysctl_page_lock_unfairness' variable (default value of 5, exposed through /proc/sys/vm/page_lock_unfairness), but that is hopefully something we'd use mainly for debugging rather than being necessary for any deep system tuning. This whole issue has exposed just how critical the page lock can be, and how contended it gets under certain locks. And the main contention doesn't really seem to be anything related to IO (which was the origin of this lock), but for things like just verifying that the page file mapping is stable while faulting in the page into a page table. Link: https://lore.kernel.org/linux-fsdevel/ed8442fd-6f54-dd84-cd4a-941e8b7ee603@MichaelLarabel.com/ Link: https://www.phoronix.com/scan.php?page=article&item=linux-50-59&num=1 Link: https://lore.kernel.org/linux-fsdevel/c560a38d-8313-51fb-b1ec-e904bd8836bc@tessares.net/ Reported-and-tested-by: Michael Larabel Tested-by: Matthieu Baerts Cc: Dave Chinner Cc: Matthew Wilcox Cc: Chris Mason Cc: Jan Kara Cc: Amir Goldstein Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e70ee2332e..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2912,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", -- cgit v1.2.3 From 3031313eb3d549b7ad6f9fbcc52ba04412e3eb9e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 1 Sep 2020 00:12:07 +0900 Subject: kprobes: Fix to check probe enabled before disarm_kprobe_ftrace() Commit 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") fixed one bug but not completely fixed yet. If we run a kprobe_module.tc of ftracetest, kernel showed a warning as below. # ./ftracetest test.d/kprobe/kprobe_module.tc === Ftrace unit tests === [1] Kprobe dynamic event - probing module ... 
[ 22.400215] ------------[ cut here ]------------ [ 22.400962] Failed to disarm kprobe-ftrace at trace_printk_irq_work+0x0/0x7e [trace_printk] (-2) [ 22.402139] WARNING: CPU: 7 PID: 200 at kernel/kprobes.c:1091 __disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.403358] Modules linked in: trace_printk(-) [ 22.404028] CPU: 7 PID: 200 Comm: rmmod Not tainted 5.9.0-rc2+ #66 [ 22.404870] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 22.406139] RIP: 0010:__disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.406947] Code: 30 8b 03 eb c9 80 3d e5 09 1f 01 00 75 dc 49 8b 34 24 89 c2 48 c7 c7 a0 c2 05 82 89 45 e4 c6 05 cc 09 1f 01 01 e8 a9 c7 f0 ff <0f> 0b 8b 45 e4 eb b9 89 c6 48 c7 c7 70 c2 05 82 89 45 e4 e8 91 c7 [ 22.409544] RSP: 0018:ffffc90000237df0 EFLAGS: 00010286 [ 22.410385] RAX: 0000000000000000 RBX: ffffffff83066024 RCX: 0000000000000000 [ 22.411434] RDX: 0000000000000001 RSI: ffffffff810de8d3 RDI: ffffffff810de8d3 [ 22.412687] RBP: ffffc90000237e10 R08: 0000000000000001 R09: 0000000000000001 [ 22.413762] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88807c478640 [ 22.414852] R13: ffffffff8235ebc0 R14: ffffffffa00060c0 R15: 0000000000000000 [ 22.415941] FS: 00000000019d48c0(0000) GS:ffff88807d7c0000(0000) knlGS:0000000000000000 [ 22.417264] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.418176] CR2: 00000000005bb7e3 CR3: 0000000078f7a000 CR4: 00000000000006a0 [ 22.419309] Call Trace: [ 22.419990] kill_kprobe+0x94/0x160 [ 22.420652] kprobes_module_callback+0x64/0x230 [ 22.421470] notifier_call_chain+0x4f/0x70 [ 22.422184] blocking_notifier_call_chain+0x49/0x70 [ 22.422979] __x64_sys_delete_module+0x1ac/0x240 [ 22.423733] do_syscall_64+0x38/0x50 [ 22.424366] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 22.425176] RIP: 0033:0x4bb81d [ 22.425741] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e0 ff ff ff f7 d8 64 89 01 48 [ 22.428726] RSP: 002b:00007ffc70fef008 EFLAGS: 00000246 ORIG_RAX: 00000000000000b0 [ 22.430169] RAX: ffffffffffffffda RBX: 00000000019d48a0 RCX: 00000000004bb81d [ 22.431375] RDX: 0000000000000000 RSI: 0000000000000880 RDI: 00007ffc70fef028 [ 22.432543] RBP: 0000000000000880 R08: 00000000ffffffff R09: 00007ffc70fef320 [ 22.433692] R10: 0000000000656300 R11: 0000000000000246 R12: 00007ffc70fef028 [ 22.434635] R13: 0000000000000000 R14: 0000000000000002 R15: 0000000000000000 [ 22.435682] irq event stamp: 1169 [ 22.436240] hardirqs last enabled at (1179): [] console_unlock+0x422/0x580 [ 22.437466] hardirqs last disabled at (1188): [] console_unlock+0x7b/0x580 [ 22.438608] softirqs last enabled at (866): [] __do_softirq+0x38e/0x490 [ 22.439637] softirqs last disabled at (859): [] asm_call_on_stack+0x12/0x20 [ 22.440690] ---[ end trace 1e7ce7e1e4567276 ]--- [ 22.472832] trace_kprobe: This probe might be able to register after target module is loaded. Continue. This is because the kill_kprobe() calls disarm_kprobe_ftrace() even if the given probe is not enabled. In that case, ftrace_set_filter_ip() fails because the given probe point is not registered to ftrace. Fix to check the given (going) probe is enabled before invoking disarm_kprobe_ftrace(). Link: https://lkml.kernel.org/r/159888672694.1411785.5987998076694782591.stgit@devnote2 Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Cc: Ingo Molnar Cc: "Naveen N . 
Rao" Cc: Anil S Keshavamurthy Cc: David Miller Cc: Muchun Song Cc: Chengming Zhou Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..d43b48ecdb4f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2159,9 +2159,10 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } -- cgit v1.2.3 From d5e47505e02666d5b3fe99212e0100791847dc8d Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Mon, 31 Aug 2020 14:26:31 +0200 Subject: ftrace: Free the trampoline when ftrace_startup() fails Commit fc0ea795f53c ("ftrace: Add symbols for ftrace trampolines") missed to remove ops from new ftrace_ops_trampoline_list in ftrace_startup() if ftrace_hash_ipmodify_enable() fails there. It may lead to BUG if such ops come from a module which may be removed. Moreover, the trampoline itself is not freed in this case. Fix it by calling ftrace_trampoline_free() during the rollback. Link: https://lkml.kernel.org/r/20200831122631.28057-1-mbenes@suse.cz Fixes: fc0ea795f53c ("ftrace: Add symbols for ftrace trampolines") Fixes: f8b8be8a310a ("ftrace, kprobes: Support IPMODIFY flag to find IP modify conflict") Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..656d7cb5a78c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2862,6 +2862,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command) __unregister_ftrace_function(ops); ftrace_start_up--; ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + ftrace_trampoline_free(ops); return ret; } -- cgit v1.2.3 From 478ece9573f04aed834e05e0bbf034320d240f9f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 1 Sep 2020 12:16:17 +0300 Subject: ftrace: Fix missing synchronize_rcu() removing trampoline from kallsyms Add synchronize_rcu() after list_del_rcu() in ftrace_remove_trampoline_from_kallsyms() to protect readers of ftrace_ops_trampoline_list (in ftrace_get_trampoline_kallsym) which is used when kallsyms is read. 
Link: https://lkml.kernel.org/r/20200901091617.31837-1-adrian.hunter@intel.com Fixes: fc0ea795f53c8d ("ftrace: Add symbols for ftrace trampolines") Signed-off-by: Adrian Hunter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 656d7cb5a78c..e9f893388245 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2782,6 +2782,7 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) { lockdep_assert_held(&ftrace_lock); list_del_rcu(&ops->list); + synchronize_rcu(); } /* -- cgit v1.2.3 From 795d6379a47bcbb88bd95a69920e4acc52849f88 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Sep 2020 10:23:31 +0200 Subject: tracing: Make the space reserved for the pid wider For 64bit CONFIG_BASE_SMALL=0 systems PID_MAX_LIMIT is set by default to 4194304. During boot the kernel sets a new value based on number of CPUs but no lower than 32768. It is 1024 per CPU so with 128 CPUs the default becomes 131072 which needs six digits. This value can be increased during run time but must not exceed the initial upper limit. Systemd sometime after v241 sets it to the upper limit during boot. The result is that when the pid exceeds five digits, the trace output is a little hard to read because it is no longer properly padded (same like on big iron with 98+ CPUs). Increase the pid padding to seven digits. Link: https://lkml.kernel.org/r/20200904082331.dcdkrr3bkn3e4qlg@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 38 +++++++++++++++++++------------------- kernel/trace/trace_output.c | 12 ++++++------ 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..2a7c26345e83 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3782,14 +3782,14 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -3810,26 +3810,26 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; - int prec = tgid ? 10 : 2; + const char *space = " "; + int prec = tgid ? 
12 : 2; print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4d1893564912..000e9dc224c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -497,7 +497,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%8.8s-%-5d %3d", + trace_seq_printf(s, "%8.8s-%-7d %3d", comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); @@ -588,15 +588,15 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) - trace_seq_printf(s, "(-----) "); + trace_seq_printf(s, "(-------) "); else - trace_seq_printf(s, "(%5d) ", tgid); + trace_seq_printf(s, "(%7d) ", tgid); } trace_seq_printf(s, "[%03d] ", iter->cpu); @@ -636,7 +636,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", + s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx); } else { @@ -917,7 +917,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, -- cgit v1.2.3 From 54fa9ba564b717fc2cf689427e195c360315999d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 7 Sep 2020 11:32:07 +0200 Subject: ftrace: Let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. 
Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler, which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Cc: Andrew Morton Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9f893388245..603255f5f085 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7534,8 +7534,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 46bbe5c671e06f070428b9be142cc4ee5cedebac Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 7 Sep 2020 06:58:45 -0700 Subject: tracing: fix double free The clang static analyzer reports this problem: trace_events_hist.c:3824:3: warning: Attempt to free released memory kfree(hist_data->attrs->var_defs.name[i]); In parse_var_defs(), if there is a problem allocating var_defs.expr, the earlier var_defs.name is freed. This free is duplicated by free_var_defs(), which frees the rest of the list. Because free_var_defs() has to run anyway, remove the second free from parse_var_defs(). Link: https://lkml.kernel.org/r/20200907135845.15804-1-trix@redhat.com Cc: stable@vger.kernel.org Fixes: 30350d65ac56 ("tracing: Add variable support to hist triggers") Reviewed-by: Tom Zanussi Signed-off-by: Tom Rix Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0b933546142e..1b2ef6490229 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3865,7 +3865,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { - kfree(hist_data->attrs->var_defs.name[n_vars]); ret = -ENOMEM; goto free; } -- cgit v1.2.3 From 82d083ab60c3693201c6f5c7a5f23a6ed422098d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:05 +0900 Subject: kprobes: tracing/kprobes: Fix to kill kprobes on initmem after boot Since the kprobe_event= cmdline option allows users to put kprobes on functions in initmem, kprobes has to make such probes gone after boot. Currently the probes on the init functions in modules will be handled by the module callback, but the kernel init text isn't handled. Without this, kprobes may access a non-existent text area to disable or remove it.
Link: https://lkml.kernel.org/r/159972810544.428528.1839307531600646955.stgit@devnote2 Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Cc: Jonathan Corbet Cc: Shuah Khan Cc: Randy Dunlap Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d43b48ecdb4f..d750e025d1ff 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2453,6 +2453,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; -- cgit v1.2.3 From b0399092ccebd9feef68d4ceb8d6219a8c0caa05 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 18 Sep 2020 21:20:21 -0700 Subject: kprobes: fix kill kprobe which has been marked as gone If a kprobe is marked as gone, we should not kill it again. Otherwise, we can disarm the kprobe more than once. In that case, the statistics of kprobe_ftrace_enabled can unbalance which can lead to that kprobe do not work. Fixes: e8386a0cb22f ("kprobes: support probing module __exit function") Co-developed-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Andrew Morton Acked-by: Masami Hiramatsu Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Song Liu Cc: Steven Rostedt Cc: Link: https://lkml.kernel.org/r/20200822030055.32383-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..049da84e1952 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2140,6 +2140,9 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + if (WARN_ON_ONCE(kprobe_gone(p))) + return; + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2419,7 +2422,10 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry(p, head, hlist) + hlist_for_each_entry(p, head, hlist) { + if (kprobe_gone(p)) + continue; + if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2436,6 +2442,7 @@ static int kprobes_module_callback(struct notifier_block *nb, */ kill_kprobe(p); } + } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); -- cgit v1.2.3 From 7bb82ac30c3dd4ecf1485685cbe84d2ba10dddf4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:34 -0700 Subject: ftrace: let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. 
Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..e9fa580f3083 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7531,8 +7531,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 4773ef33fc6e59bad2e5d19e334de2fa79c27b74 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:37 -0700 Subject: stackleak: let stack_erasing_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of stack_erasing_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/stackleak.c:31:50: warning: incorrect type in argument 3 (different address spaces) kernel/stackleak.c:31:50: expected void * kernel/stackleak.c:31:50: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093253.13656-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- kernel/stackleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index a8fc9ae1d03d..ce161a8e8d97 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -20,7 +20,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); -- cgit v1.2.3 From 3ad1c8ef083bef96ec922688966484be1039e6b5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 21 Sep 2020 12:31:36 +0200 Subject: rcu/tree: Export rcu_idle_{enter,exit} to modules Fix this link error: ERROR: modpost: "rcu_idle_enter" [drivers/acpi/processor.ko] undefined! ERROR: modpost: "rcu_idle_exit" [drivers/acpi/processor.ko] undefined! when CONFIG_ACPI_PROCESSOR is built as module. PeterZ says that in light of ARM needing those soon too, they should simply be exported. Fixes: 1fecfdbb7acc ("ACPI: processor: Take over RCU-idle for C3-BM idle") Reported-by: Sven Joachim Suggested-by: Peter Zijlstra Signed-off-by: Borislav Petkov Reviewed-by: Paul E. McKenney Signed-off-by: Rafael J. 
Wysocki --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..f78ee759af9c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -673,6 +673,7 @@ void rcu_idle_enter(void) lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -886,6 +887,7 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** -- cgit v1.2.3 From e23bb04b0c938588eae41b7f4712b722290ed2b8 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:33 -0700 Subject: bpf: Fix sysfs export of empty BTF section If BTF data is missing or removed from the ELF section it is still exported via sysfs as a zero-length file: root@OpenWrt:/# ls -l /sys/kernel/btf/vmlinux -r--r--r-- 1 root root 0 Jul 18 02:59 /sys/kernel/btf/vmlinux Moreover, reads from this file succeed and leak kernel data: root@OpenWrt:/# hexdump -C /sys/kernel/btf/vmlinux|head -10 000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 000cc0 00 00 00 00 00 00 00 00 00 00 00 00 80 83 b0 80 |................| 000cd0 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| 000ce0 00 00 00 00 00 00 00 00 00 00 00 00 57 ac 6e 9d |............W.n.| 000cf0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 002650 00 00 00 00 00 00 00 10 00 00 00 01 00 00 00 01 |................| 002660 80 82 9a c4 80 85 97 80 81 a9 51 68 00 00 00 02 |..........Qh....| 002670 80 25 44 dc 80 85 97 80 81 a9 50 24 81 ab c4 60 |.%D.......P$...`| This situation was first observed with kernel 5.4.x, cross-compiled for a MIPS target system. Fix by adding a sanity-check for export of zero-length data sections. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b38db205a66238f70823039a8c531535864eaac5.1600417359.git.Tony.Ambardar@gmail.com --- kernel/bpf/sysfs_btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5a..11b3380887fa 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } -- cgit v1.2.3 From 008cfe4418b3dbda2ff820cdd7b1a5ce458ae444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:57 -0400 Subject: mm: Introduce mm_struct.has_pinned (Commit message majorly collected from Jason Gunthorpe) Reduce the chance of false positive from page_maybe_dma_pinned() by keeping track if the mm_struct has ever been used with pin_user_pages(). This allows cases that might drive up the page ref_count to avoid any penalty from handling dma_pinned pages. Future work is planned, to provide a more sophisticated solution, likely to turn it into a real counter. For now, make it atomic_t but use it as a boolean for simplicity. 
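An assumed (not from this patch) call-site sketch of the boolean usage:

	/* Mark the mm the first time pages are pinned, so later checks stay cheap. */
	if (!atomic_read(&mm->has_pinned))
		atomic_set(&mm->has_pinned, 1);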
Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..e65d8192d080 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); -- cgit v1.2.3 From 7a4830c380f3a8b3425f6383deff58e65b2557b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:58 -0400 Subject: mm/fork: Pass new vma pointer into copy_page_range() This prepares for the future work to trigger early cow on pinned pages during fork(). No functional change intended. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e65d8192d080..da8d360fb032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); -- cgit v1.2.3 From 851e6f61cd076954f9d521e0d79b173ad3a2453b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 29 Sep 2020 12:27:23 -0400 Subject: tracing: Fix trace_find_next_entry() accounting of temp buffer size The temp buffer size variable for trace_find_next_entry() was incorrectly being updated when the size did not change. The temp buffer size should only be updated when it is reallocated. This is mostly an issue when used with ftrace_dump(). That's because ftrace_dump() cannot allocate a new buffer, and instead uses a temporary buffer with a fixed size. But the variable that keeps track of that size is incorrectly updated with each call, and it could fall into the path that would try to reallocate the buffer and produce a warning. ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1601 at kernel/trace/trace.c:3548 trace_find_next_entry+0xd0/0xe0 Modules linked in [..] CPU: 1 PID: 1601 Comm: bash Not tainted 5.9.0-rc5-test+ #521 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:trace_find_next_entry+0xd0/0xe0 Code: 40 21 00 00 4c 89 e1 31 d2 4c 89 ee 48 89 df e8 c6 9e ff ff 89 ab 54 21 00 00 5b 5d 41 5c 41 5d c3 48 63 d5 eb bf 31 c0 eb f0 <0f> 0b 48 63 d5 eb b4 66 0f 1f 84 00 00 00 00 00 53 48 8d 8f 60 21 RSP: 0018:ffff95a4f2e8bd70 EFLAGS: 00010046 RAX: ffffffff96679fc0 RBX: ffffffff97910de0 RCX: ffffffff96679fc0 RDX: ffff95a4f2e8bd98 RSI: ffff95a4ee321098 RDI: ffffffff97913000 RBP: 0000000000000018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000046 R12: ffff95a4f2e8bd98 R13: 0000000000000000 R14: ffff95a4ee321098 R15: 00000000009aa301 FS: 00007f8565484740(0000) GS:ffff95a55aa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055876bd43d90 CR3: 00000000b76e6003 CR4: 00000000001706e0 Call Trace: trace_print_lat_context+0x58/0x2d0 ?
cpumask_next+0x16/0x20 print_trace_line+0x1a4/0x4f0 ftrace_dump.cold+0xad/0x12c __handle_sysrq.cold+0x51/0x126 write_sysrq_trigger+0x3f/0x4a proc_reg_write+0x53/0x80 vfs_write+0xca/0x210 ksys_write+0x70/0xf0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f8565579487 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffd40707948 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8565579487 RDX: 0000000000000002 RSI: 000055876bd74de0 RDI: 0000000000000001 RBP: 000055876bd74de0 R08: 000000000000000a R09: 0000000000000001 R10: 000055876bdec280 R11: 0000000000000246 R12: 0000000000000002 R13: 00007f856564a500 R14: 0000000000000002 R15: 00007f856564a700 irq event stamp: 109958 ---[ end trace 7aab5b7e51484b00 ]--- Not only fix the updating of the temp buffer, but also do not free the temp buffer before a new buffer is allocated (there's no reason not to continue to use the current temp buffer if an allocation fails). Cc: stable@vger.kernel.org Fixes: 8e99cf91b99bb ("tracing: Do not allocate buffer in trace_find_next_entry() in atomic") Reported-by: Anna-Maria Behnsen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c26345e83..d3e5de717df2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3546,13 +3546,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { - kfree(iter->temp); - iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); - if (!iter->temp) + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); - iter->temp_size = iter->ent_size; iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); -- cgit v1.2.3 From b40341fad6cc2daa195f8090fd3348f18fff640a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 29 Sep 2020 12:40:31 -0400 Subject: ftrace: Move RCU is watching check after recursion check The first thing that the ftrace function callback helper functions should do is to check for recursion. Peter Zijlstra found that when "rcu_is_watching()" had its notrace removed, it caused perf function tracing to crash. This is because the call of rcu_is_watching() is tested before function recursion is checked, and if it is traced, it will cause an infinite recursion loop. rcu_is_watching() should still stay notrace, but even so, this should never have crashed in the first place. The recursion prevention must be the first thing done in callback functions.
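To make the required ordering concrete, here is a hedged, generic C sketch (all identifiers are made up; this is not the ftrace implementation): the recursion guard is taken before any other check, so a check that itself happens to be traced cannot re-enter the callback forever.

/* Generic illustration only; names are hypothetical. */
static __thread int in_callback;	/* per-thread recursion guard */

static int check_that_may_be_traced(void)
{
	return 1;	/* stand-in for e.g. an RCU-is-watching style check */
}

static void do_work(void)
{
}

static void my_callback(void)
{
	if (in_callback)	/* recursion check comes first */
		return;
	in_callback = 1;

	if (check_that_may_be_traced())	/* safe: guard is already held */
		do_work();

	in_callback = 0;
}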
Link: https://lore.kernel.org/r/20200929112541.GM2628@hirez.programming.kicks-ass.net Cc: stable@vger.kernel.org Cc: Paul McKenney Fixes: c68c0fa293417 ("ftrace: Have ftrace_ops_get_func() handle RCU and PER_CPU flags too") Acked-by: Peter Zijlstra (Intel) Reported-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 603255f5f085..541453927c82 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6993,16 +6993,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; preempt_disable_notrace(); - op->func(ip, parent_ip, op, regs); + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); -- cgit v1.2.3 From 4013c1496c49615d90d36b9d513eee8e369778e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 5 Oct 2020 10:56:22 -0700 Subject: usermodehelper: reset umask to default before executing user process Kernel threads intentionally do CLONE_FS in order to follow any changes that 'init' does to set up the root directory (or cwd). It is admittedly a bit odd, but it avoids the situation where 'init' does some extensive setup to initialize the system environment, and then we execute a usermode helper program, and it uses the original FS setup from boot time that may be very limited and incomplete. [ Both Al Viro and Eric Biederman point out that 'pivot_root()' will follow the root regardless, since it fixes up other users of root (see chroot_fs_refs() for details), but overmounting root and doing a chroot() would not. ] However, Vegard Nossum noticed that the CLONE_FS not only means that we follow the root and current working directories, it also means we share umask with whatever init changed it to. That wasn't intentional. Just reset umask to the original default (0022) before actually starting the usermode helper program. Reported-by: Vegard Nossum Cc: Al Viro Acked-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/umh.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index fcf3ee803630..3f646613a9d3 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,14 @@ static int call_usermodehelper_exec_async(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); + /* + * Initial kernel threads share ther FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. -- cgit v1.2.3 From 5b9fbeb75b6a98955f628e205ac26689bcb1383e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Oct 2020 15:48:58 +0200 Subject: bpf: Fix scalar32_min_max_or bounds tracking Simon reported an issue with the current scalar32_min_max_or() implementation. 
That is, compared to the other 32 bit subreg tracking functions, the code in scalar32_min_max_or() stands out that it's using the 64 bit registers instead of 32 bit ones. This leads to bounds tracking issues, for example: [...] 8: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 8: (79) r1 = *(u64 *)(r0 +0) R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 9: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 9: (b7) r0 = 1 10: R0_w=inv1 R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 10: (18) r2 = 0x600000002 12: R0_w=inv1 R1_w=inv(id=0) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 12: (ad) if r1 < r2 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: (95) exit 14: R0_w=inv1 R1_w=inv(id=0,umax_value=25769803777,var_off=(0x0; 0x7ffffffff)) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 14: (25) if r1 > 0x0 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: (95) exit 16: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=25769803777,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 16: (47) r1 |= 0 17: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=32212254719,var_off=(0x1; 0x700000000),s32_max_value=1,u32_max_value=1) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm [...] The bound tests on the map value force the upper unsigned bound to be 25769803777 in 64 bit (0b11000000000000000000000000000000001) and then lower one to be 1. By using OR they are truncated and thus result in the range [1,1] for the 32 bit reg tracker. This is incorrect given the only thing we know is that the value must be positive and thus 2147483647 (0b1111111111111111111111111111111) at max for the subregs. Fix it by using the {u,s}32_{min,max}_value vars instead. This also makes sense, for example, for the case where we update dst_reg->s32_{min,max}_value in the else branch we need to use the newly computed dst_reg->u32_{min,max}_value as we know that these are positive. Previously, in the else branch the 64 bit values of umin_value=1 and umax_value=32212254719 were used and latter got truncated to be 1 as upper bound there. After the fix the subreg range is now correct: [...] 
8: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 8: (79) r1 = *(u64 *)(r0 +0) R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 9: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 9: (b7) r0 = 1 10: R0_w=inv1 R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 10: (18) r2 = 0x600000002 12: R0_w=inv1 R1_w=inv(id=0) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 12: (ad) if r1 < r2 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: (95) exit 14: R0_w=inv1 R1_w=inv(id=0,umax_value=25769803777,var_off=(0x0; 0x7ffffffff)) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 14: (25) if r1 > 0x0 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: (95) exit 16: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=25769803777,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 16: (47) r1 |= 0 17: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=32212254719,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm [...] Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Simon Scannell Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47e74f09fa37..fba52d9ec8fc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5667,8 +5667,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->smin_value; - u32 umin_val = src_reg->umin_value; + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; /* Assuming scalar64_min_max_or will be called so it is safe * to skip updating register for known case. @@ -5691,8 +5691,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* ORing two positives gives a positive, so safe to * cast result into s64. */ - dst_reg->s32_min_value = dst_reg->umin_value; - dst_reg->s32_max_value = dst_reg->umax_value; + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; } } -- cgit v1.2.3 From 6d6b8b9f4fceab7266ca03d194f60ec72bd4b654 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 27 Aug 2020 12:17:32 +0530 Subject: perf: Fix task_function_call() error handling The error handling introduced by commit 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") loses any return value from smp_call_function_single() that is not {0, -EINVAL}. This is a problem because it will return -ENXIO when the target CPU is offline. Worse, in that case it'll turn into an infinite loop.
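For illustration, a hedged sketch of the retry pattern at issue (hypothetical names, not kernel/events/core.c): collapsing every failure into a retry spins forever on a permanent error such as -ENXIO for an offline CPU, so only the transient -EAGAIN case should be retried and every other error should reach the caller.

#include <linux/errno.h>
#include <linux/sched.h>

/* Hypothetical helper: run fn(arg) on a remote CPU; returns 0 on
 * success and stores fn()'s own return value in *fn_ret. */
extern int cross_cpu_call(int (*fn)(void *), void *arg, int *fn_ret);

static int call_remote(int (*fn)(void *), void *arg)
{
	int ret;

	for (;;) {
		int fn_ret = 0;

		ret = cross_cpu_call(fn, arg, &fn_ret);
		if (!ret)
			ret = fn_ret;	/* success: report fn()'s result */
		if (ret != -EAGAIN)
			break;		/* e.g. -ENXIO: do not retry */
		cond_resched();		/* transient failure: try again */
	}
	return ret;
}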
Fixes: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") Reported-by: Srikar Dronamraju Signed-off-by: Kajol Jain Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Barret Rhoden Tested-by: Srikar Dronamraju Link: https://lkml.kernel.org/r/20200827064732.20860-1-kjain@linux.ibm.com --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ed5248f0445..e8bf92202542 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -99,7 +99,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret; if (ret != -EAGAIN) break; -- cgit v1.2.3 From b2058cd93d930d7b9f76f34590c0d432cd6470c7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Dec 2020 17:26:35 -0800 Subject: Input: gtco - remove driver The driver has its own HID descriptor parsing code, that had and still has several issues discovered by syzbot and other tools. Ideally we should move the driver over to the HID subsystem, so that it uses proven parsing code. However the devices in question are EOL, and GTCO is not willing to extend resources for that, so let's simply remove the driver. Note that our HID support has greatly improved over the last 10 years, we may also consider reverting 6f8d9e26e7de ("hid-core.c: Adds all GTCO CalComp Digitizers and InterWrite School Products to blacklist") and see if GTCO devices actually work with normal HID drivers. Link: https://lore.kernel.org/r/X8wbBtO5KidME17K@google.com Signed-off-by: Dmitry Torokhov --- kernel/configs/android-recommended.config | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 81e9af7dcec2..53d688bdd894 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -111,7 +111,6 @@ CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y CONFIG_TASKSTATS=y -- cgit v1.2.3