From dfd402a4c4baae42398ce9180ff424d589b8bffc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:54 +0100 Subject: kcsan: Add Kernel Concurrency Sanitizer infrastructure Kernel Concurrency Sanitizer (KCSAN) is a dynamic data-race detector for kernel space. KCSAN is a sampling watchpoint-based data-race detector. See the included Documentation/dev-tools/kcsan.rst for more details. This patch adds basic infrastructure, but does not yet enable KCSAN for any architecture. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/Makefile | 1 + kernel/kcsan/Makefile | 11 + kernel/kcsan/atomic.h | 27 +++ kernel/kcsan/core.c | 626 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kcsan/debugfs.c | 275 +++++++++++++++++++++ kernel/kcsan/encoding.h | 94 ++++++++ kernel/kcsan/kcsan.h | 108 +++++++++ kernel/kcsan/report.c | 320 +++++++++++++++++++++++++ kernel/kcsan/test.c | 121 ++++++++++ 9 files changed, 1583 insertions(+) create mode 100644 kernel/kcsan/Makefile create mode 100644 kernel/kcsan/atomic.h create mode 100644 kernel/kcsan/core.c create mode 100644 kernel/kcsan/debugfs.c create mode 100644 kernel/kcsan/encoding.h create mode 100644 kernel/kcsan/kcsan.h create mode 100644 kernel/kcsan/report.c create mode 100644 kernel/kcsan/test.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..74ab46e2ebd1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile new file mode 100644 index 000000000000..dd15b62ec0b5 --- /dev/null +++ b/kernel/kcsan/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +KCSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ + $(call cc-option,-fno-stack-protector,) + +obj-y := core.o debugfs.o report.o +obj-$(CONFIG_KCSAN_SELFTEST) += test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h new file mode 100644 index 000000000000..c9c3fe628011 --- /dev/null +++ b/kernel/kcsan/atomic.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ATOMIC_H +#define _KERNEL_KCSAN_ATOMIC_H + +#include + +/* + * Helper that returns true if access to ptr should be considered as an atomic + * access, even though it is not explicitly atomic. + * + * List all volatile globals that have been observed in races, to suppress + * data race reports between accesses to these variables. + * + * For now, we assume that volatile accesses of globals are as strong as atomic + * accesses (READ_ONCE, WRITE_ONCE cast to volatile). The situation is still not + * entirely clear, as on some architectures (Alpha) READ_ONCE/WRITE_ONCE do more + * than cast to volatile. Eventually, we hope to be able to remove this + * function. + */ +static inline bool kcsan_is_atomic(const volatile void *ptr) +{ + /* only jiffies for now */ + return ptr == &jiffies; +} + +#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c new file mode 100644 index 000000000000..d9410d58c93e --- /dev/null +++ b/kernel/kcsan/core.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "atomic.h" +#include "encoding.h" +#include "kcsan.h" + +bool kcsan_enabled; + +/* Per-CPU kcsan_ctx for interrupts */ +static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, +}; + +/* + * Helper macros to index into adjacent slots slots, starting from address slot + * itself, followed by the right and left slots. + * + * The purpose is 2-fold: + * + * 1. if during insertion the address slot is already occupied, check if + * any adjacent slots are free; + * 2. accesses that straddle a slot boundary due to size that exceeds a + * slot's range may check adjacent slots if any watchpoint matches. + * + * Note that accesses with very large size may still miss a watchpoint; however, + * given this should be rare, this is a reasonable trade-off to make, since this + * will avoid: + * + * 1. excessive contention between watchpoint checks and setup; + * 2. larger number of simultaneous watchpoints without sacrificing + * performance. + * + * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]: + * + * slot=0: [ 1, 2, 0] + * slot=9: [10, 11, 9] + * slot=63: [64, 65, 63] + */ +#define NUM_SLOTS (1 + 2 * KCSAN_CHECK_ADJACENT) +#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) + +/* + * SLOT_IDX_FAST is used in fast-path. Not first checking the address's primary + * slot (middle) is fine if we assume that data races occur rarely. The set of + * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to + * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. + */ +#define SLOT_IDX_FAST(slot, i) (slot + i) + +/* + * Watchpoints, with each entry encoded as defined in encoding.h: in order to be + * able to safely update and access a watchpoint without introducing locking + * overhead, we encode each watchpoint as a single atomic long. The initial + * zero-initialized state matches INVALID_WATCHPOINT. + * + * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to + * use more complicated SLOT_IDX_FAST calculation with modulo in fast-path. + */ +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; + +/* + * Instructions to skip watching counter, used in should_watch(). We use a + * per-CPU counter to avoid excessive contention. + */ +static DEFINE_PER_CPU(long, kcsan_skip); + +static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, + bool expect_write, + long *encoded_watchpoint) +{ + const int slot = watchpoint_slot(addr); + const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; + atomic_long_t *watchpoint; + unsigned long wp_addr_masked; + size_t wp_size; + bool is_write; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)]; + *encoded_watchpoint = atomic_long_read(watchpoint); + if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked, + &wp_size, &is_write)) + continue; + + if (expect_write && !is_write) + continue; + + /* Check if the watchpoint matches the access. */ + if (matching_access(wp_addr_masked, wp_size, addr_masked, size)) + return watchpoint; + } + + return NULL; +} + +static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, + bool is_write) +{ + const int slot = watchpoint_slot(addr); + const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); + atomic_long_t *watchpoint; + int i; + + /* Check slot index logic, ensuring we stay within array bounds. */ + BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT + 1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, + KCSAN_CHECK_ADJACENT) != + ARRAY_SIZE(watchpoints) - 1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, + KCSAN_CHECK_ADJACENT + 1) != + ARRAY_SIZE(watchpoints) - NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + long expect_val = INVALID_WATCHPOINT; + + /* Try to acquire this slot. */ + watchpoint = &watchpoints[SLOT_IDX(slot, i)]; + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, + encoded_watchpoint)) + return watchpoint; + } + + return NULL; +} + +/* + * Return true if watchpoint was successfully consumed, false otherwise. + * + * This may return false if: + * + * 1. another thread already consumed the watchpoint; + * 2. the thread that set up the watchpoint already removed it; + * 3. the watchpoint was removed and then re-used. + */ +static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, + CONSUMED_WATCHPOINT); +} + +/* + * Return true if watchpoint was not touched, false if consumed. + */ +static inline bool remove_watchpoint(atomic_long_t *watchpoint) +{ + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != + CONSUMED_WATCHPOINT; +} + +static inline struct kcsan_ctx *get_ctx(void) +{ + /* + * In interrupt, use raw_cpu_ptr to avoid unnecessary checks, that would + * also result in calls that generate warnings in uaccess regions. + */ + return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); +} + +static inline bool is_atomic(const volatile void *ptr) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (unlikely(ctx->atomic_next > 0)) { + /* + * Because we do not have separate contexts for nested + * interrupts, in case atomic_next is set, we simply assume that + * the outer interrupt set atomic_next. In the worst case, we + * will conservatively consider operations as atomic. This is a + * reasonable trade-off to make, since this case should be + * extremely rare; however, even if extremely rare, it could + * lead to false positives otherwise. + */ + if ((hardirq_count() >> HARDIRQ_SHIFT) < 2) + --ctx->atomic_next; /* in task, or outer interrupt */ + return true; + } + if (unlikely(ctx->atomic_nest_count > 0 || ctx->in_flat_atomic)) + return true; + + return kcsan_is_atomic(ptr); +} + +static inline bool should_watch(const volatile void *ptr, int type) +{ + /* + * Never set up watchpoints when memory operations are atomic. + * + * Need to check this first, before kcsan_skip check below: (1) atomics + * should not count towards skipped instructions, and (2) to actually + * decrement kcsan_atomic_next for consecutive instruction stream. + */ + if ((type & KCSAN_ACCESS_ATOMIC) != 0 || is_atomic(ptr)) + return false; + + if (this_cpu_dec_return(kcsan_skip) >= 0) + return false; + + /* + * NOTE: If we get here, kcsan_skip must always be reset in slow path + * via reset_kcsan_skip() to avoid underflow. + */ + + /* this operation should be watched */ + return true; +} + +static inline void reset_kcsan_skip(void) +{ + long skip_count = CONFIG_KCSAN_SKIP_WATCH - + (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? + prandom_u32_max(CONFIG_KCSAN_SKIP_WATCH) : + 0); + this_cpu_write(kcsan_skip, skip_count); +} + +static inline bool kcsan_is_enabled(void) +{ + return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; +} + +static inline unsigned int get_delay(void) +{ + unsigned int delay = in_task() ? CONFIG_KCSAN_UDELAY_TASK : + CONFIG_KCSAN_UDELAY_INTERRUPT; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + prandom_u32_max(delay) : + 0); +} + +/* + * Pull everything together: check_access() below contains the performance + * critical operations; the fast-path (including check_access) functions should + * all be inlinable by the instrumentation functions. + * + * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are + * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can + * be filtered from the stacktrace, as well as give them unique names for the + * UACCESS whitelist of objtool. Each function uses user_access_save/restore(), + * since they do not access any user memory, but instrumentation is still + * emitted in UACCESS regions. + */ + +static noinline void kcsan_found_watchpoint(const volatile void *ptr, + size_t size, bool is_write, + atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + unsigned long flags; + bool consumed; + + if (!kcsan_is_enabled()) + return; + /* + * Consume the watchpoint as soon as possible, to minimize the chances + * of !consumed. Consuming the watchpoint must always be guarded by + * kcsan_is_enabled() check, as otherwise we might erroneously + * triggering reports when disabled. + */ + consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); + + /* keep this after try_consume_watchpoint */ + flags = user_access_save(); + + if (consumed) { + kcsan_report(ptr, size, is_write, true, raw_smp_processor_id(), + KCSAN_REPORT_CONSUMED_WATCHPOINT); + } else { + /* + * The other thread may not print any diagnostics, as it has + * already removed the watchpoint, or another thread consumed + * the watchpoint before this thread. + */ + kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + } + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + user_access_restore(flags); +} + +static noinline void kcsan_setup_watchpoint(const volatile void *ptr, + size_t size, bool is_write) +{ + atomic_long_t *watchpoint; + union { + u8 _1; + u16 _2; + u32 _4; + u64 _8; + } expect_value; + bool value_change = false; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + + /* + * Always reset kcsan_skip counter in slow-path to avoid underflow; see + * should_watch(). + */ + reset_kcsan_skip(); + + if (!kcsan_is_enabled()) + goto out; + + if (!check_encodable((unsigned long)ptr, size)) { + kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + goto out; + } + + /* + * Disable interrupts & preemptions to avoid another thread on the same + * CPU accessing memory locations for the set up watchpoint; this is to + * avoid reporting races to e.g. CPU-local data. + * + * An alternative would be adding the source CPU to the watchpoint + * encoding, and checking that watchpoint-CPU != this-CPU. There are + * several problems with this: + * 1. we should avoid stealing more bits from the watchpoint encoding + * as it would affect accuracy, as well as increase performance + * overhead in the fast-path; + * 2. if we are preempted, but there *is* a genuine data race, we + * would *not* report it -- since this is the common case (vs. + * CPU-local data accesses), it makes more sense (from a data race + * detection point of view) to simply disable preemptions to ensure + * as many tasks as possible run on other CPUs. + */ + local_irq_save(irq_flags); + + watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); + if (watchpoint == NULL) { + /* + * Out of capacity: the size of `watchpoints`, and the frequency + * with which `should_watch()` returns true should be tweaked so + * that this case happens very rarely. + */ + kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + goto out_unlock; + } + + kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); + kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + + /* + * Read the current value, to later check and infer a race if the data + * was modified via a non-instrumented access, e.g. from a device. + */ + switch (size) { + case 1: + expect_value._1 = READ_ONCE(*(const u8 *)ptr); + break; + case 2: + expect_value._2 = READ_ONCE(*(const u16 *)ptr); + break; + case 4: + expect_value._4 = READ_ONCE(*(const u32 *)ptr); + break; + case 8: + expect_value._8 = READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { + kcsan_disable_current(); + pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + is_write ? "write" : "read", size, ptr, + watchpoint_slot((unsigned long)ptr), + encode_watchpoint((unsigned long)ptr, size, is_write)); + kcsan_enable_current(); + } + + /* + * Delay this thread, to increase probability of observing a racy + * conflicting access. + */ + udelay(get_delay()); + + /* + * Re-read value, and check if it is as expected; if not, we infer a + * racy access. + */ + switch (size) { + case 1: + value_change = expect_value._1 != READ_ONCE(*(const u8 *)ptr); + break; + case 2: + value_change = expect_value._2 != READ_ONCE(*(const u16 *)ptr); + break; + case 4: + value_change = expect_value._4 != READ_ONCE(*(const u32 *)ptr); + break; + case 8: + value_change = expect_value._8 != READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + /* Check if this access raced with another. */ + if (!remove_watchpoint(watchpoint)) { + /* + * No need to increment 'data_races' counter, as the racing + * thread already did. + */ + kcsan_report(ptr, size, is_write, size > 8 || value_change, + smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + } else if (value_change) { + /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + kcsan_report(ptr, size, is_write, true, + smp_processor_id(), + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); + } + + kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); +out_unlock: + local_irq_restore(irq_flags); +out: + user_access_restore(ua_flags); +} + +static __always_inline void check_access(const volatile void *ptr, size_t size, + int type) +{ + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + atomic_long_t *watchpoint; + long encoded_watchpoint; + + /* + * Avoid user_access_save in fast-path: find_watchpoint is safe without + * user_access_save, as the address that ptr points to is only used to + * check if a watchpoint exists; ptr is never dereferenced. + */ + watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + &encoded_watchpoint); + /* + * It is safe to check kcsan_is_enabled() after find_watchpoint in the + * slow-path, as long as no state changes that cause a data race to be + * detected and reported have occurred until kcsan_is_enabled() is + * checked. + */ + + if (unlikely(watchpoint != NULL)) + kcsan_found_watchpoint(ptr, size, is_write, watchpoint, + encoded_watchpoint); + else if (unlikely(should_watch(ptr, type))) + kcsan_setup_watchpoint(ptr, size, is_write); +} + +/* === Public interface ===================================================== */ + +void __init kcsan_init(void) +{ + BUG_ON(!in_task()); + + kcsan_debugfs_init(); + + /* + * We are in the init task, and no other tasks should be running; + * WRITE_ONCE without memory barrier is sufficient. + */ + if (IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE)) + WRITE_ONCE(kcsan_enabled, true); +} + +/* === Exported interface =================================================== */ + +void kcsan_disable_current(void) +{ + ++get_ctx()->disable_count; +} +EXPORT_SYMBOL(kcsan_disable_current); + +void kcsan_enable_current(void) +{ + if (get_ctx()->disable_count-- == 0) { + /* + * Warn if kcsan_enable_current() calls are unbalanced with + * kcsan_disable_current() calls, which causes disable_count to + * become negative and should not happen. + */ + kcsan_disable_current(); /* restore to 0, KCSAN still enabled */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_enable_current); + +void kcsan_nestable_atomic_begin(void) +{ + /* + * Do *not* check and warn if we are in a flat atomic region: nestable + * and flat atomic regions are independent from each other. + * See include/linux/kcsan.h: struct kcsan_ctx comments for more + * comments. + */ + + ++get_ctx()->atomic_nest_count; +} +EXPORT_SYMBOL(kcsan_nestable_atomic_begin); + +void kcsan_nestable_atomic_end(void) +{ + if (get_ctx()->atomic_nest_count-- == 0) { + /* + * Warn if kcsan_nestable_atomic_end() calls are unbalanced with + * kcsan_nestable_atomic_begin() calls, which causes + * atomic_nest_count to become negative and should not happen. + */ + kcsan_nestable_atomic_begin(); /* restore to 0 */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_nestable_atomic_end); + +void kcsan_flat_atomic_begin(void) +{ + get_ctx()->in_flat_atomic = true; +} +EXPORT_SYMBOL(kcsan_flat_atomic_begin); + +void kcsan_flat_atomic_end(void) +{ + get_ctx()->in_flat_atomic = false; +} +EXPORT_SYMBOL(kcsan_flat_atomic_end); + +void kcsan_atomic_next(int n) +{ + get_ctx()->atomic_next = n; +} +EXPORT_SYMBOL(kcsan_atomic_next); + +void __kcsan_check_access(const volatile void *ptr, size_t size, int type) +{ + check_access(ptr, size, type); +} +EXPORT_SYMBOL(__kcsan_check_access); + +/* + * KCSAN uses the same instrumentation that is emitted by supported compilers + * for ThreadSanitizer (TSAN). + * + * When enabled, the compiler emits instrumentation calls (the functions + * prefixed with "__tsan" below) for all loads and stores that it generated; + * inline asm is not instrumented. + * + * Note that, not all supported compiler versions distinguish aligned/unaligned + * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned + * version to the generic version, which can handle both. + */ + +#define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr) \ + { \ + check_access(ptr, size, 0); \ + } \ + EXPORT_SYMBOL(__tsan_read##size); \ + void __tsan_unaligned_read##size(void *ptr) \ + __alias(__tsan_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr) \ + { \ + check_access(ptr, size, KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_write##size); \ + void __tsan_unaligned_write##size(void *ptr) \ + __alias(__tsan_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_write##size) + +DEFINE_TSAN_READ_WRITE(1); +DEFINE_TSAN_READ_WRITE(2); +DEFINE_TSAN_READ_WRITE(4); +DEFINE_TSAN_READ_WRITE(8); +DEFINE_TSAN_READ_WRITE(16); + +void __tsan_read_range(void *ptr, size_t size) +{ + check_access(ptr, size, 0); +} +EXPORT_SYMBOL(__tsan_read_range); + +void __tsan_write_range(void *ptr, size_t size) +{ + check_access(ptr, size, KCSAN_ACCESS_WRITE); +} +EXPORT_SYMBOL(__tsan_write_range); + +/* + * The below are not required by KCSAN, but can still be emitted by the + * compiler. + */ +void __tsan_func_entry(void *call_pc) +{ +} +EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void) +{ +} +EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void) +{ +} +EXPORT_SYMBOL(__tsan_init); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c new file mode 100644 index 000000000000..041d520a0183 --- /dev/null +++ b/kernel/kcsan/debugfs.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcsan.h" + +/* + * Statistics counters. + */ +static atomic_long_t counters[KCSAN_COUNTER_COUNT]; + +/* + * Addresses for filtering functions from reporting. This list can be used as a + * whitelist or blacklist. + */ +static struct { + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ +} report_filterlist = { + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ +}; +static DEFINE_SPINLOCK(report_filterlist_lock); + +static const char *counter_to_name(enum kcsan_counter_id id) +{ + switch (id) { + case KCSAN_COUNTER_USED_WATCHPOINTS: + return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: + return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: + return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: + return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: + return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: + return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: + return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: + return "encoding_false_positives"; + case KCSAN_COUNTER_COUNT: + BUG(); + } + return NULL; +} + +void kcsan_counter_inc(enum kcsan_counter_id id) +{ + atomic_long_inc(&counters[id]); +} + +void kcsan_counter_dec(enum kcsan_counter_id id) +{ + atomic_long_dec(&counters[id]); +} + +/* + * The microbenchmark allows benchmarking KCSAN core runtime only. To run + * multiple threads, pipe 'microbench=' from multiple tasks into the + * debugfs file. + */ +static void microbenchmark(unsigned long iters) +{ + cycles_t cycles; + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + /* + * We can run this benchmark from multiple tasks; this address + * calculation increases likelyhood of some accesses overlapping + * (they still won't conflict because all are reads). + */ + unsigned long addr = + iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); + __kcsan_check_read((void *)addr, sizeof(long)); + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); +} + +static int cmp_filterlist_addrs(const void *rhs, const void *lhs) +{ + const unsigned long a = *(const unsigned long *)rhs; + const unsigned long b = *(const unsigned long *)lhs; + + return a < b ? -1 : a == b ? 0 : 1; +} + +bool kcsan_skip_report_debugfs(unsigned long func_addr) +{ + unsigned long symbolsize, offset; + unsigned long flags; + bool ret = false; + + if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) + return false; + func_addr -= offset; /* get function start */ + + spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == 0) + goto out; + + /* Sort array if it is unsorted, and then do a binary search. */ + if (!report_filterlist.sorted) { + sort(report_filterlist.addrs, report_filterlist.used, + sizeof(unsigned long), cmp_filterlist_addrs, NULL); + report_filterlist.sorted = true; + } + ret = !!bsearch(&func_addr, report_filterlist.addrs, + report_filterlist.used, sizeof(unsigned long), + cmp_filterlist_addrs); + if (report_filterlist.whitelist) + ret = !ret; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static void set_report_filterlist_whitelist(bool whitelist) +{ + unsigned long flags; + + spin_lock_irqsave(&report_filterlist_lock, flags); + report_filterlist.whitelist = whitelist; + spin_unlock_irqrestore(&report_filterlist_lock, flags); +} + +/* Returns 0 on success, error-code otherwise. */ +static ssize_t insert_report_filterlist(const char *func) +{ + unsigned long flags; + unsigned long addr = kallsyms_lookup_name(func); + ssize_t ret = 0; + + if (!addr) { + pr_err("KCSAN: could not find function: '%s'\n", func); + return -ENOENT; + } + + spin_lock_irqsave(&report_filterlist_lock, flags); + + if (report_filterlist.addrs == NULL) { + /* initial allocation */ + report_filterlist.addrs = + kmalloc_array(report_filterlist.size, + sizeof(unsigned long), GFP_KERNEL); + if (report_filterlist.addrs == NULL) { + ret = -ENOMEM; + goto out; + } + } else if (report_filterlist.used == report_filterlist.size) { + /* resize filterlist */ + size_t new_size = report_filterlist.size * 2; + unsigned long *new_addrs = + krealloc(report_filterlist.addrs, + new_size * sizeof(unsigned long), GFP_KERNEL); + + if (new_addrs == NULL) { + /* leave filterlist itself untouched */ + ret = -ENOMEM; + goto out; + } + + report_filterlist.size = new_size; + report_filterlist.addrs = new_addrs; + } + + /* Note: deduplicating should be done in userspace. */ + report_filterlist.addrs[report_filterlist.used++] = + kallsyms_lookup_name(func); + report_filterlist.sorted = false; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static int show_info(struct seq_file *file, void *v) +{ + int i; + unsigned long flags; + + /* show stats */ + seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) + seq_printf(file, "%s: %ld\n", counter_to_name(i), + atomic_long_read(&counters[i])); + + /* show filter functions, and filter type */ + spin_lock_irqsave(&report_filterlist_lock, flags); + seq_printf(file, "\n%s functions: %s\n", + report_filterlist.whitelist ? "whitelisted" : "blacklisted", + report_filterlist.used == 0 ? "none" : ""); + for (i = 0; i < report_filterlist.used; ++i) + seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return 0; +} + +static int debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_info, NULL); +} + +static ssize_t debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + char kbuf[KSYM_NAME_LEN]; + char *arg; + int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1); + + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + arg = strstrip(kbuf); + + if (!strcmp(arg, "on")) { + WRITE_ONCE(kcsan_enabled, true); + } else if (!strcmp(arg, "off")) { + WRITE_ONCE(kcsan_enabled, false); + } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + return -EINVAL; + microbenchmark(iters); + } else if (!strcmp(arg, "whitelist")) { + set_report_filterlist_whitelist(true); + } else if (!strcmp(arg, "blacklist")) { + set_report_filterlist_whitelist(false); + } else if (arg[0] == '!') { + ssize_t ret = insert_report_filterlist(&arg[1]); + + if (ret < 0) + return ret; + } else { + return -EINVAL; + } + + return count; +} + +static const struct file_operations debugfs_ops = { .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release }; + +void __init kcsan_debugfs_init(void) +{ + debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); +} diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h new file mode 100644 index 000000000000..e17bdac0e54b --- /dev/null +++ b/kernel/kcsan/encoding.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ENCODING_H +#define _KERNEL_KCSAN_ENCODING_H + +#include +#include +#include + +#include "kcsan.h" + +#define SLOT_RANGE PAGE_SIZE +#define INVALID_WATCHPOINT 0 +#define CONSUMED_WATCHPOINT 1 + +/* + * The maximum useful size of accesses for which we set up watchpoints is the + * max range of slots we check on an access. + */ +#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT)) + +/* + * Number of bits we use to store size info. + */ +#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE) +/* + * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS); + * however, most 64-bit architectures do not use the full 64-bit address space. + * Also, in order for a false positive to be observable 2 things need to happen: + * + * 1. different addresses but with the same encoded address race; + * 2. and both map onto the same watchpoint slots; + * + * Both these are assumed to be very unlikely. However, in case it still happens + * happens, the report logic will filter out the false positive (see report.c). + */ +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS) + +/* + * Masks to set/retrieve the encoded data. + */ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG - 1) +#define WATCHPOINT_SIZE_MASK \ + GENMASK(BITS_PER_LONG - 2, BITS_PER_LONG - 2 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_MASK \ + GENMASK(BITS_PER_LONG - 3 - WATCHPOINT_SIZE_BITS, 0) + +static inline bool check_encodable(unsigned long addr, size_t size) +{ + return size <= MAX_ENCODABLE_SIZE; +} + +static inline long encode_watchpoint(unsigned long addr, size_t size, + bool is_write) +{ + return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) | + (size << WATCHPOINT_ADDR_BITS) | + (addr & WATCHPOINT_ADDR_MASK)); +} + +static inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, size_t *size, + bool *is_write) +{ + if (watchpoint == INVALID_WATCHPOINT || + watchpoint == CONSUMED_WATCHPOINT) + return false; + + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> + WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + + return true; +} + +/* + * Return watchpoint slot for an address. + */ +static inline int watchpoint_slot(unsigned long addr) +{ + return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; +} + +static inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) +{ + unsigned long end_range1 = addr1 + size1 - 1; + unsigned long end_range2 = addr2 + size2 - 1; + + return addr1 <= end_range2 && addr2 <= end_range1; +} + +#endif /* _KERNEL_KCSAN_ENCODING_H */ diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h new file mode 100644 index 000000000000..1bb2f1c0d61e --- /dev/null +++ b/kernel/kcsan/kcsan.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please + * see Documentation/dev-tools/kcsan.rst. + */ + +#ifndef _KERNEL_KCSAN_KCSAN_H +#define _KERNEL_KCSAN_KCSAN_H + +#include + +/* The number of adjacent watchpoints to check. */ +#define KCSAN_CHECK_ADJACENT 1 + +/* + * Globally enable and disable KCSAN. + */ +extern bool kcsan_enabled; + +/* + * Initialize debugfs file. + */ +void kcsan_debugfs_init(void); + +enum kcsan_counter_id { + /* + * Number of watchpoints currently in use. + */ + KCSAN_COUNTER_USED_WATCHPOINTS, + + /* + * Total number of watchpoints set up. + */ + KCSAN_COUNTER_SETUP_WATCHPOINTS, + + /* + * Total number of data races. + */ + KCSAN_COUNTER_DATA_RACES, + + /* + * Number of times no watchpoints were available. + */ + KCSAN_COUNTER_NO_CAPACITY, + + /* + * A thread checking a watchpoint raced with another checking thread; + * only one will be reported. + */ + KCSAN_COUNTER_REPORT_RACES, + + /* + * Observed data value change, but writer thread unknown. + */ + KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN, + + /* + * The access cannot be encoded to a valid watchpoint. + */ + KCSAN_COUNTER_UNENCODABLE_ACCESSES, + + /* + * Watchpoint encoding caused a watchpoint to fire on mismatching + * accesses. + */ + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES, + + KCSAN_COUNTER_COUNT, /* number of counters */ +}; + +/* + * Increment/decrement counter with given id; avoid calling these in fast-path. + */ +void kcsan_counter_inc(enum kcsan_counter_id id); +void kcsan_counter_dec(enum kcsan_counter_id id); + +/* + * Returns true if data races in the function symbol that maps to func_addr + * (offsets are ignored) should *not* be reported. + */ +bool kcsan_skip_report_debugfs(unsigned long func_addr); + +enum kcsan_report_type { + /* + * The thread that set up the watchpoint and briefly stalled was + * signalled that another thread triggered the watchpoint. + */ + KCSAN_REPORT_RACE_SIGNAL, + + /* + * A thread found and consumed a matching watchpoint. + */ + KCSAN_REPORT_CONSUMED_WATCHPOINT, + + /* + * No other thread was observed to race with the access, but the data + * value before and after the stall differs. + */ + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, +}; +/* + * Print a race report from thread that encountered the race. + */ +void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); + +#endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c new file mode 100644 index 000000000000..ead5610bafa7 --- /dev/null +++ b/kernel/kcsan/report.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include "kcsan.h" +#include "encoding.h" + +/* + * Max. number of stack entries to show in the report. + */ +#define NUM_STACK_ENTRIES 64 + +/* + * Other thread info: communicated from other racing thread to thread that set + * up the watchpoint, which then prints the complete report atomically. Only + * need one struct, as all threads should to be serialized regardless to print + * the reports, with reporting being in the slow-path. + */ +static struct { + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; +} other_info = { .ptr = NULL }; + +/* + * This spinlock protects reporting and other_info, since other_info is usually + * required when reporting. + */ +static DEFINE_SPINLOCK(report_lock); + +/* + * Special rules to skip reporting. + */ +static bool skip_report(bool is_write, bool value_change, + unsigned long top_frame) +{ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && + !value_change) { + /* + * The access is a write, but the data value did not change. + * + * We opt-out of this filter for certain functions at request of + * maintainers. + */ + char buf[64]; + + snprintf(buf, sizeof(buf), "%ps", (void *)top_frame); + if (!strnstr(buf, "rcu_", sizeof(buf)) && + !strnstr(buf, "_rcu", sizeof(buf)) && + !strnstr(buf, "_srcu", sizeof(buf))) + return true; + } + + return kcsan_skip_report_debugfs(top_frame); +} + +static inline const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +/* Return thread description: in task or interrupt. */ +static const char *get_thread_desc(int task_id) +{ + if (task_id != -1) { + static char buf[32]; /* safe: protected by report_lock */ + + snprintf(buf, sizeof(buf), "task %i", task_id); + return buf; + } + return "interrupt"; +} + +/* Helper to skip KCSAN-related functions in stack-trace. */ +static int get_stack_skipnr(unsigned long stack_entries[], int num_entries) +{ + char buf[64]; + int skip = 0; + + for (; skip < num_entries; ++skip) { + snprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + if (!strnstr(buf, "csan_", sizeof(buf)) && + !strnstr(buf, "tsan_", sizeof(buf)) && + !strnstr(buf, "_once_size", sizeof(buf))) { + break; + } + } + return skip; +} + +/* Compares symbolized strings of addr1 and addr2. */ +static int sym_strcmp(void *addr1, void *addr2) +{ + char buf1[64]; + char buf2[64]; + + snprintf(buf1, sizeof(buf1), "%pS", addr1); + snprintf(buf2, sizeof(buf2), "%pS", addr2); + return strncmp(buf1, buf2, sizeof(buf1)); +} + +/* + * Returns true if a report was generated, false otherwise. + */ +static bool print_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, + enum kcsan_report_type type) +{ + unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; + int num_stack_entries = + stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + int other_skipnr; + + /* + * Must check report filter rules before starting to print. + */ + if (skip_report(is_write, true, stack_entries[skipnr])) + return false; + + if (type == KCSAN_REPORT_RACE_SIGNAL) { + other_skipnr = get_stack_skipnr(other_info.stack_entries, + other_info.num_stack_entries); + + /* value_change is only known for the other thread */ + if (skip_report(other_info.is_write, value_change, + other_info.stack_entries[other_skipnr])) + return false; + } + + /* Print report header. */ + pr_err("==================================================================\n"); + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: { + void *this_fn = (void *)stack_entries[skipnr]; + void *other_fn = (void *)other_info.stack_entries[other_skipnr]; + int cmp; + + /* + * Order functions lexographically for consistent bug titles. + * Do not print offset of functions to keep title short. + */ + cmp = sym_strcmp(other_fn, this_fn); + pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + cmp < 0 ? other_fn : this_fn, + cmp < 0 ? this_fn : other_fn); + } break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("BUG: KCSAN: data-race in %pS\n", + (void *)stack_entries[skipnr]); + break; + + default: + BUG(); + } + + pr_err("\n"); + + /* Print information about the racing accesses. */ + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(other_info.is_write), other_info.ptr, + other_info.size, get_thread_desc(other_info.task_pid), + other_info.cpu_id); + + /* Print the other thread's stack trace. */ + stack_trace_print(other_info.stack_entries + other_skipnr, + other_info.num_stack_entries - other_skipnr, + 0); + + pr_err("\n"); + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(is_write), ptr, size, + get_thread_desc(in_task() ? task_pid_nr(current) : -1), + cpu_id); + break; + + default: + BUG(); + } + /* Print stack trace of this thread. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + + /* Print report footer. */ + pr_err("\n"); + pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); + dump_stack_print_info(KERN_DEFAULT); + pr_err("==================================================================\n"); + + return true; +} + +static void release_report(unsigned long *flags, enum kcsan_report_type type) +{ + if (type == KCSAN_REPORT_RACE_SIGNAL) + other_info.ptr = NULL; /* mark for reuse */ + + spin_unlock_irqrestore(&report_lock, *flags); +} + +/* + * Depending on the report type either sets other_info and returns false, or + * acquires the matching other_info and returns true. If other_info is not + * required for the report type, simply acquires report_lock and returns true. + */ +static bool prepare_report(unsigned long *flags, const volatile void *ptr, + size_t size, bool is_write, int cpu_id, + enum kcsan_report_type type) +{ + if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && + type != KCSAN_REPORT_RACE_SIGNAL) { + /* other_info not required; just acquire report_lock */ + spin_lock_irqsave(&report_lock, *flags); + return true; + } + +retry: + spin_lock_irqsave(&report_lock, *flags); + + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + if (other_info.ptr != NULL) + break; /* still in use, retry */ + + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save( + other_info.stack_entries, NUM_STACK_ENTRIES, 1); + + spin_unlock_irqrestore(&report_lock, *flags); + + /* + * The other thread will print the summary; other_info may now + * be consumed. + */ + return false; + + case KCSAN_REPORT_RACE_SIGNAL: + if (other_info.ptr == NULL) + break; /* no data available yet, retry */ + + /* + * First check if this is the other_info we are expecting, i.e. + * matches based on how watchpoint was encoded. + */ + if (!matching_access((unsigned long)other_info.ptr & + WATCHPOINT_ADDR_MASK, + other_info.size, + (unsigned long)ptr & WATCHPOINT_ADDR_MASK, + size)) + break; /* mismatching watchpoint, retry */ + + if (!matching_access((unsigned long)other_info.ptr, + other_info.size, (unsigned long)ptr, + size)) { + /* + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. + */ + kcsan_counter_inc( + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + + /* discard this other_info */ + release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + return false; + } + + /* + * Matching & usable access in other_info: keep other_info_lock + * locked, as this thread consumes it to print the full report; + * unlocked in release_report. + */ + return true; + + default: + BUG(); + } + + spin_unlock_irqrestore(&report_lock, *flags); + goto retry; +} + +void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type) +{ + unsigned long flags = 0; + + kcsan_disable_current(); + if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { + if (print_report(ptr, size, is_write, value_change, cpu_id, + type) && + panic_on_warn) + panic("panic_on_warn set ...\n"); + + release_report(&flags, type); + } + kcsan_enable_current(); +} diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c new file mode 100644 index 000000000000..0bae63c5ca65 --- /dev/null +++ b/kernel/kcsan/test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. + */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. */ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); -- cgit v1.2.3 From 0ebba7141eadc4804ec5b4bb5106331b0c3abf4c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:58 +0100 Subject: build, kcsan: Add KCSAN build exceptions This blacklists several compilation units from KCSAN. See the respective inline comments for the reasoning. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/Makefile | 5 +++++ kernel/sched/Makefile | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 74ab46e2ebd1..cc53f7c25446 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,9 @@ endif # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n +# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data +# are CPU local" => assume no data races), to reduce overhead in interrupts. +KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n @@ -30,6 +33,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible @@ -118,6 +122,7 @@ obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n +KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n $(obj)/configs.o: $(obj)/config_data.gz diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..e9307a9c54e7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,6 +7,12 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n +# There are numerous races here, however, most of them due to plain accesses. +# This would make it even harder for syzbot to find reproducers, because these +# bugs trigger without specific input. Disable by default, but should re-enable +# eventually. +KCSAN_SANITIZE := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond -- cgit v1.2.3 From 5cbaefe9743bf14c9d3106db0cc19f8cb0a3ca22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Nov 2019 10:41:43 +0100 Subject: kcsan: Improve various small stylistic details Tidy up a few bits: - Fix typos and grammar, improve wording. - Remove spurious newlines that are col80 warning artifacts where the resulting line-break is worse than the disease it's curing. - Use core kernel coding style to improve readability and reduce spurious code pattern variations. - Use better vertical alignment for structure definitions and initialization sequences. - Misc other small details. No change in functionality intended. Cc: linux-kernel@vger.kernel.org Cc: Marco Elver Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 59 +++++++++++++++++++++------------------------- kernel/kcsan/debugfs.c | 62 +++++++++++++++++++++++-------------------------- kernel/kcsan/encoding.h | 25 ++++++++++---------- kernel/kcsan/kcsan.h | 11 +++++---- kernel/kcsan/report.c | 42 ++++++++++++++++----------------- kernel/kcsan/test.c | 6 ++--- kernel/sched/Makefile | 2 +- 8 files changed, 100 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index c9c3fe628011..576e03ddd6a3 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -6,7 +6,7 @@ #include /* - * Helper that returns true if access to ptr should be considered as an atomic + * Helper that returns true if access to @ptr should be considered an atomic * access, even though it is not explicitly atomic. * * List all volatile globals that have been observed in races, to suppress diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index d9410d58c93e..3314fc29e236 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -19,10 +19,10 @@ bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, }; /* @@ -50,11 +50,11 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * slot=9: [10, 11, 9] * slot=63: [64, 65, 63] */ -#define NUM_SLOTS (1 + 2 * KCSAN_CHECK_ADJACENT) +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) #define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) /* - * SLOT_IDX_FAST is used in fast-path. Not first checking the address's primary + * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary * slot (middle) is fine if we assume that data races occur rarely. The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. @@ -68,9 +68,9 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * zero-initialized state matches INVALID_WATCHPOINT. * * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to - * use more complicated SLOT_IDX_FAST calculation with modulo in fast-path. + * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path. */ -static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; /* * Instructions to skip watching counter, used in should_watch(). We use a @@ -78,7 +78,8 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, +static inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, bool expect_write, long *encoded_watchpoint) { @@ -110,8 +111,8 @@ static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, return NULL; } -static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline atomic_long_t * +insert_watchpoint(unsigned long addr, size_t size, bool is_write) { const int slot = watchpoint_slot(addr); const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -120,21 +121,16 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, /* Check slot index logic, ensuring we stay within array bounds. */ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); - BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT + 1) != 0); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT) != - ARRAY_SIZE(watchpoints) - 1); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT + 1) != - ARRAY_SIZE(watchpoints) - NUM_SLOTS); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS); for (i = 0; i < NUM_SLOTS; ++i) { long expect_val = INVALID_WATCHPOINT; /* Try to acquire this slot. */ watchpoint = &watchpoints[SLOT_IDX(slot, i)]; - if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, - encoded_watchpoint)) + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint)) return watchpoint; } @@ -150,11 +146,10 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, - long encoded_watchpoint) +static inline bool +try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { - return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, - CONSUMED_WATCHPOINT); + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); } /* @@ -162,14 +157,13 @@ static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, */ static inline bool remove_watchpoint(atomic_long_t *watchpoint) { - return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != - CONSUMED_WATCHPOINT; + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } static inline struct kcsan_ctx *get_ctx(void) { /* - * In interrupt, use raw_cpu_ptr to avoid unnecessary checks, that would + * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would * also result in calls that generate warnings in uaccess regions. */ return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); @@ -260,7 +254,8 @@ static inline unsigned int get_delay(void) */ static noinline void kcsan_found_watchpoint(const volatile void *ptr, - size_t size, bool is_write, + size_t size, + bool is_write, atomic_long_t *watchpoint, long encoded_watchpoint) { @@ -296,8 +291,8 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, user_access_restore(flags); } -static noinline void kcsan_setup_watchpoint(const volatile void *ptr, - size_t size, bool is_write) +static noinline void +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) { atomic_long_t *watchpoint; union { @@ -346,8 +341,8 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { /* - * Out of capacity: the size of `watchpoints`, and the frequency - * with which `should_watch()` returns true should be tweaked so + * Out of capacity: the size of 'watchpoints', and the frequency + * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 041d520a0183..bec42dab32ee 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -24,39 +24,31 @@ static atomic_long_t counters[KCSAN_COUNTER_COUNT]; * whitelist or blacklist. */ static struct { - unsigned long *addrs; /* array of addresses */ - size_t size; /* current size */ - int used; /* number of elements used */ - bool sorted; /* if elements are sorted */ - bool whitelist; /* if list is a blacklist or whitelist */ + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ } report_filterlist = { - .addrs = NULL, - .size = 8, /* small initial size */ - .used = 0, - .sorted = false, - .whitelist = false, /* default is blacklist */ + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ }; static DEFINE_SPINLOCK(report_filterlist_lock); static const char *counter_to_name(enum kcsan_counter_id id) { switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: - return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: - return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: - return "data_races"; - case KCSAN_COUNTER_NO_CAPACITY: - return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: - return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: - return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: - return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: - return "encoding_false_positives"; + case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; case KCSAN_COUNTER_COUNT: BUG(); } @@ -116,7 +108,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) return false; - func_addr -= offset; /* get function start */ + func_addr -= offset; /* Get function start */ spin_lock_irqsave(&report_filterlist_lock, flags); if (report_filterlist.used == 0) @@ -195,6 +187,7 @@ static ssize_t insert_report_filterlist(const char *func) out: spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; } @@ -226,8 +219,8 @@ static int debugfs_open(struct inode *inode, struct file *file) return single_open(file, show_info, NULL); } -static ssize_t debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *off) +static ssize_t +debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { char kbuf[KSYM_NAME_LEN]; char *arg; @@ -264,10 +257,13 @@ static ssize_t debugfs_write(struct file *file, const char __user *buf, return count; } -static const struct file_operations debugfs_ops = { .read = seq_read, - .open = debugfs_open, - .write = debugfs_write, - .release = single_release }; +static const struct file_operations debugfs_ops = +{ + .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release +}; void __init kcsan_debugfs_init(void) { diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index e17bdac0e54b..b63890e86449 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -10,7 +10,8 @@ #include "kcsan.h" #define SLOT_RANGE PAGE_SIZE -#define INVALID_WATCHPOINT 0 + +#define INVALID_WATCHPOINT 0 #define CONSUMED_WATCHPOINT 1 /* @@ -34,24 +35,24 @@ * Both these are assumed to be very unlikely. However, in case it still happens * happens, the report logic will filter out the false positive (see report.c). */ -#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) /* * Masks to set/retrieve the encoded data. */ -#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG - 1) +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) #define WATCHPOINT_SIZE_MASK \ - GENMASK(BITS_PER_LONG - 2, BITS_PER_LONG - 2 - WATCHPOINT_SIZE_BITS) + GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) #define WATCHPOINT_ADDR_MASK \ - GENMASK(BITS_PER_LONG - 3 - WATCHPOINT_SIZE_BITS, 0) + GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) static inline bool check_encodable(unsigned long addr, size_t size) { return size <= MAX_ENCODABLE_SIZE; } -static inline long encode_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline long +encode_watchpoint(unsigned long addr, size_t size, bool is_write) { return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) | (size << WATCHPOINT_ADDR_BITS) | @@ -59,17 +60,17 @@ static inline long encode_watchpoint(unsigned long addr, size_t size, } static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, size_t *size, + unsigned long *addr_masked, + size_t *size, bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) return false; - *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; - *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> - WATCHPOINT_ADDR_BITS; - *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); return true; } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 1bb2f1c0d61e..d3b9a96ac8a4 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -72,14 +72,14 @@ enum kcsan_counter_id { /* * Increment/decrement counter with given id; avoid calling these in fast-path. */ -void kcsan_counter_inc(enum kcsan_counter_id id); -void kcsan_counter_dec(enum kcsan_counter_id id); +extern void kcsan_counter_inc(enum kcsan_counter_id id); +extern void kcsan_counter_dec(enum kcsan_counter_id id); /* * Returns true if data races in the function symbol that maps to func_addr * (offsets are ignored) should *not* be reported. */ -bool kcsan_skip_report_debugfs(unsigned long func_addr); +extern bool kcsan_skip_report_debugfs(unsigned long func_addr); enum kcsan_report_type { /* @@ -99,10 +99,11 @@ enum kcsan_report_type { */ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, }; + /* * Print a race report from thread that encountered the race. */ -void kcsan_report(const volatile void *ptr, size_t size, bool is_write, - bool value_change, int cpu_id, enum kcsan_report_type type); +extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ead5610bafa7..0eea05a3135b 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -22,13 +22,13 @@ * the reports, with reporting being in the slow-path. */ static struct { - const volatile void *ptr; - size_t size; - bool is_write; - int task_pid; - int cpu_id; - unsigned long stack_entries[NUM_STACK_ENTRIES]; - int num_stack_entries; + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; } other_info = { .ptr = NULL }; /* @@ -40,8 +40,8 @@ static DEFINE_SPINLOCK(report_lock); /* * Special rules to skip reporting. */ -static bool skip_report(bool is_write, bool value_change, - unsigned long top_frame) +static bool +skip_report(bool is_write, bool value_change, unsigned long top_frame) { if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && !value_change) { @@ -105,6 +105,7 @@ static int sym_strcmp(void *addr1, void *addr2) snprintf(buf1, sizeof(buf1), "%pS", addr1); snprintf(buf2, sizeof(buf2), "%pS", addr2); + return strncmp(buf1, buf2, sizeof(buf1)); } @@ -116,8 +117,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, enum kcsan_report_type type) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; - int num_stack_entries = - stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); int other_skipnr; @@ -131,7 +131,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, other_skipnr = get_stack_skipnr(other_info.stack_entries, other_info.num_stack_entries); - /* value_change is only known for the other thread */ + /* @value_change is only known for the other thread */ if (skip_report(other_info.is_write, value_change, other_info.stack_entries[other_skipnr])) return false; @@ -241,13 +241,12 @@ retry: if (other_info.ptr != NULL) break; /* still in use, retry */ - other_info.ptr = ptr; - other_info.size = size; - other_info.is_write = is_write; - other_info.task_pid = in_task() ? task_pid_nr(current) : -1; - other_info.cpu_id = cpu_id; - other_info.num_stack_entries = stack_trace_save( - other_info.stack_entries, NUM_STACK_ENTRIES, 1); + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); spin_unlock_irqrestore(&report_lock, *flags); @@ -299,6 +298,7 @@ retry: } spin_unlock_irqrestore(&report_lock, *flags); + goto retry; } @@ -309,9 +309,7 @@ void kcsan_report(const volatile void *ptr, size_t size, bool is_write, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { - if (print_report(ptr, size, is_write, value_change, cpu_id, - type) && - panic_on_warn) + if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c index 0bae63c5ca65..cc6000239dc0 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/test.c @@ -34,7 +34,7 @@ static bool test_encode_decode(void) if (WARN_ON(!check_encodable(addr, size))) return false; - /* encode and decode */ + /* Encode and decode */ { const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -42,7 +42,7 @@ static bool test_encode_decode(void) size_t verif_size; bool verif_is_write; - /* check special watchpoints */ + /* Check special watchpoints */ if (WARN_ON(decode_watchpoint( INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write))) @@ -52,7 +52,7 @@ static bool test_encode_decode(void) &verif_size, &verif_is_write))) return false; - /* check decoding watchpoint returns same data */ + /* Check decoding watchpoint returns same data */ if (WARN_ON(!decode_watchpoint( encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write))) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e9307a9c54e7..5fc9c9b70862 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,7 +7,7 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous races here, however, most of them due to plain accesses. +# There are numerous data races here, however, most of them are due to plain accesses. # This would make it even harder for syzbot to find reproducers, because these # bugs trigger without specific input. Disable by default, but should re-enable # eventually. -- cgit v1.2.3 From d47715f50e833f12c5e829ce9dcc4a65104fa74f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 19 Nov 2019 19:57:42 +0100 Subject: kcsan, ubsan: Make KCSAN+UBSAN work together Context: http://lkml.kernel.org/r/fb7e25d8-aba4-3dcf-7761-cb7ecb3ebb71@infradead.org Reported-by: Randy Dunlap Signed-off-by: Marco Elver Acked-by: Randy Dunlap # build-tested Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index dd15b62ec0b5..df6b7799e492 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 KCSAN_SANITIZE := n KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From 5661dd95a2958634485bb1a53f90a6ab621d4b0c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Jan 2020 15:16:44 -0700 Subject: printk: Convert a use of sprintf to snprintf in console_unlock When CONFIG_PRINTK is disabled (e.g. when building allnoconfig), clang warns: ../kernel/printk/printk.c:2416:10: warning: 'sprintf' will always overflow; destination buffer has size 0, but format string expands to at least 33 [-Wfortify-source] len = sprintf(text, ^ 1 warning generated. It is not wrong; text has a zero size when CONFIG_PRINTK is disabled because LOG_LINE_MAX and PREFIX_MAX are both zero. Change to snprintf so that this case is explicitly handled without any risk of overflow. Link: https://github.com/ClangBuiltLinux/linux/issues/846 Link: https://github.com/llvm/llvm-project/commit/6d485ff455ea2b37fef9e06e426dae6c1241b231 Link: http://lkml.kernel.org/r/20200130221644.2273-1-natechancellor@gmail.com Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Cc: clang-built-linux@googlegroups.com Signed-off-by: Nathan Chancellor Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fada22dc4ab6..a44094727a5c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2413,9 +2413,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; -- cgit v1.2.3 From ad8cd1db80cc33337bdbee63c453ef6d5132474b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:31 +0100 Subject: printk: Move console matching logic into a separate function This moves the loop that tries to match a newly registered console with the command line or add_preferred_console list into a separate helper, in order to be able to call it multiple times in subsequent patches. Link: https://lore.kernel.org/r/20200213095133.23176-2-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 105 ++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fada22dc4ab6..0ebcdf53e75d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,6 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; +static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2626,6 +2627,60 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); +/* + * This is called by register_console() to try to match + * the newly registered console with any of the ones selected + * by either the command line or add_preferred_console() and + * setup/enable it. + * + * Care need to be taken with consoles that are statically + * enabled such as netconsole + */ +static int try_enable_new_console(struct console *newcon) +{ + struct console_cmdline *c; + int i; + + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) + return 0; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + return -EIO; + } + newcon->flags |= CON_ENABLED; + if (i == preferred_console) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } + return 0; + } + + /* + * Some consoles, such as pstore and netconsole, can be enabled even + * without matching. + */ + if (newcon->flags & CON_ENABLED) + return 0; + + return -ENOENT; +} + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -2647,11 +2702,9 @@ early_param("keep_bootcon", keep_bootcon_setup); */ void register_console(struct console *newcon) { - int i; unsigned long flags; struct console *bcon = NULL; - struct console_cmdline *c; - static bool has_preferred; + int err; if (console_drivers) for_each_console(bcon) @@ -2678,15 +2731,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred || bcon || !console_drivers) - has_preferred = preferred_console >= 0; + if (!has_preferred_console || bcon || !console_drivers) + has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred) { + if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2694,47 +2747,19 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred = true; + has_preferred_console = true; } } } /* - * See if this console matches one we selected on - * the command line. + * See if this console matches one we selected on + * the command line or if it was statically enabled */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { - if (!newcon->match || - newcon->match(newcon, c->name, c->index, c->options) != 0) { - /* default matching */ - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; - - if (_braille_register_console(newcon, c)) - return; - - if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - break; - } - - newcon->flags |= CON_ENABLED; - if (i == preferred_console) { - newcon->flags |= CON_CONSDEV; - has_preferred = true; - } - break; - } + err = try_enable_new_console(newcon); - if (!(newcon->flags & CON_ENABLED)) + /* printk() messages are not printed to the Braille console. */ + if (err || newcon->flags & CON_BRL) return; /* -- cgit v1.2.3 From e369d8227fd211be36242fc44a9dc2209e246b9a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:32 +0100 Subject: printk: Fix preferred console selection with multiple matches In the following circumstances, the rule of selecting the console corresponding to the last "console=" entry on the command line as the preferred console (CON_CONSDEV, ie, /dev/console) fails. This is a specific example, but it could happen with different consoles that have a similar name aliasing mechanism. - The kernel command line has both console=tty0 and console=ttyS0 in that order (the latter with speed etc... arguments). This is common with some cloud setups such as Amazon Linux. - add_preferred_console is called early to register "uart0". In our case that happens from acpi_parse_spcr() on arm64 since the "enable_console" argument is true on that architecture. This causes "uart0" to become entry 0 of the console_cmdline array. Now, because of the above, what happens is: - add_preferred_console is called by the cmdline parsing for tty0 and ttyS0 respectively, thus occupying entries 1 and 2 of the console_cmdline array (since this happens after ACPI SPCR parsing). At that point preferred_console is set to 2 as expected. - When the tty layer kicks in, it will call register_console for tty0. This will match entry 1 in console_cmdline array. It isn't our preferred console but because it's our only console at this point, it will end up "first" in the consoles list. - When 8250 probes the actual serial port later on, it calls register_console for ttyS0. At that point the loop in register_console tries to match it with the entries in the console_cmdline array. Ideally this should match ttyS0 in entry 2, which is preferred, causing it to be inserted first and to replace tty0 as CONSDEV. However, 8250 provides a "match" hook in its struct console, and that hook will match "uart" as an alias to "ttyS". So we match uart0 at entry 0 in the array which is not the preferred console and will not match entry 2 which is since we break out of the loop on the first match. As a result, we don't set CONSDEV and don't insert it first, but second in the console list. As a result, we end up with tty0 remaining first in the array, and thus /dev/console going there instead of the last user specified one which is ttyS0. This tentative fix register_console() to scan first for consoles specified on the command line, and only if none is found, to then scan for consoles specified by the architecture. Link: https://lore.kernel.org/r/20200213095133.23176-3-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/console_cmdline.h | 1 + kernel/printk/printk.c | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 11f19c466af5..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -6,6 +6,7 @@ struct console_cmdline { char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ + bool user_specified; /* Specified by command line vs. platform */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE char *brl_options; /* Options for braille driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0ebcdf53e75d..f76ef3f0efca 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2116,7 +2116,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...) #endif static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) + char *brl_options, bool user_specified) { struct console_cmdline *c; int i; @@ -2131,6 +2131,8 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; + if (user_specified) + c->user_specified = true; return 0; } } @@ -2140,6 +2142,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; + c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; @@ -2194,7 +2197,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - __add_preferred_console(buf, idx, options, brl_options); + __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } @@ -2215,7 +2218,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - return __add_preferred_console(name, idx, options, NULL); + return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; @@ -2636,7 +2639,7 @@ early_param("keep_bootcon", keep_bootcon_setup); * Care need to be taken with consoles that are statically * enabled such as netconsole */ -static int try_enable_new_console(struct console *newcon) +static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i; @@ -2644,6 +2647,8 @@ static int try_enable_new_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + if (c->user_specified != user_specified) + continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ @@ -2673,9 +2678,10 @@ static int try_enable_new_console(struct console *newcon) /* * Some consoles, such as pstore and netconsole, can be enabled even - * without matching. + * without matching. Accept the pre-enabled consoles only when match() + * and setup() had a change to be called. */ - if (newcon->flags & CON_ENABLED) + if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; @@ -2752,11 +2758,12 @@ void register_console(struct console *newcon) } } - /* - * See if this console matches one we selected on - * the command line or if it was statically enabled - */ - err = try_enable_new_console(newcon); + /* See if this console matches one we selected on the command line */ + err = try_enable_new_console(newcon, true); + + /* If not, try to match against the platform default(s) */ + if (err == -ENOENT) + err = try_enable_new_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) -- cgit v1.2.3 From 33225d7b0ac9903c5701bbede7dfdaeba74ad6c3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:33 +0100 Subject: printk: Correctly set CON_CONSDEV even when preferred console was not registered CON_CONSDEV flag was historically used to put/keep the preferred console first in console_drivers list. Where the preferred console is the last on the command line. The ordering is important only when opening /dev/console: + tty_kopen() + tty_lookup_driver() + console_device() The flag was originally an implementation detail. But it was later made accessible from userspace via /proc/consoles. It was used, for example, by the tool "showconsole" to show the real tty accessible via /dev/console, see https://github.com/bitstreamout/showconsole Now, the current code sets CON_CONSDEV only for the preferred console or when a fallback console is added. The flag is not set when the preferred console is defined on the command line but it is not registered from some reasons. Simple solution is to set CON_CONSDEV flag for the first registered console. It will work most of the time because: + Most real consoles have console->device defined. + Boot consoles are removed in printk_late_init(). + unregister_console() moves CON_CONSDEV flag to the next console. Clean solution would require checking con->device when the preferred console is registered and in unregister_console(). Conclusion: Use the simple solution for now. It is better than the current state and good enough. The clean solution is not worth it. It would complicate the already complicated code without too much gain. Instead the code would deserve a complete rewrite. Link: https://lore.kernel.org/r/20200213095133.23176-4-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt [pmladek@suse.com: Correct reasoning in the commit message, comment update.] Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f76ef3f0efca..cf0ceacdae2f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,6 +2788,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; -- cgit v1.2.3 From 5c361425744d1e3b03d835dde659708683ca27d1 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 7 Jan 2020 17:31:04 +0100 Subject: kcsan: Prefer __always_inline for fast-path Prefer __always_inline for fast-path functions that are called outside of user_access_save, to avoid generating UACCESS warnings when optimizing for size (CC_OPTIMIZE_FOR_SIZE). It will also avoid future surprises with compiler versions that change the inlining heuristic even when optimizing for performance. Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/58708908-84a0-0a81-a836-ad97e33dbb62@infradead.org --- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 18 +++++++++--------- kernel/kcsan/encoding.h | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index 576e03ddd6a3..a9c193053491 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -18,7 +18,7 @@ * than cast to volatile. Eventually, we hope to be able to remove this * function. */ -static inline bool kcsan_is_atomic(const volatile void *ptr) +static __always_inline bool kcsan_is_atomic(const volatile void *ptr) { /* only jiffies for now */ return ptr == &jiffies; diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3314fc29e236..4d4ab5c5dc53 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -78,10 +78,10 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, - size_t size, - bool expect_write, - long *encoded_watchpoint) +static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, + bool expect_write, + long *encoded_watchpoint) { const int slot = watchpoint_slot(addr); const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; @@ -146,7 +146,7 @@ insert_watchpoint(unsigned long addr, size_t size, bool is_write) * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool +static __always_inline bool try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); @@ -160,7 +160,7 @@ static inline bool remove_watchpoint(atomic_long_t *watchpoint) return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } -static inline struct kcsan_ctx *get_ctx(void) +static __always_inline struct kcsan_ctx *get_ctx(void) { /* * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would @@ -169,7 +169,7 @@ static inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } -static inline bool is_atomic(const volatile void *ptr) +static __always_inline bool is_atomic(const volatile void *ptr) { struct kcsan_ctx *ctx = get_ctx(); @@ -193,7 +193,7 @@ static inline bool is_atomic(const volatile void *ptr) return kcsan_is_atomic(ptr); } -static inline bool should_watch(const volatile void *ptr, int type) +static __always_inline bool should_watch(const volatile void *ptr, int type) { /* * Never set up watchpoints when memory operations are atomic. @@ -226,7 +226,7 @@ static inline void reset_kcsan_skip(void) this_cpu_write(kcsan_skip, skip_count); } -static inline bool kcsan_is_enabled(void) +static __always_inline bool kcsan_is_enabled(void) { return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index b63890e86449..f03562aaf2eb 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -59,10 +59,10 @@ encode_watchpoint(unsigned long addr, size_t size, bool is_write) (addr & WATCHPOINT_ADDR_MASK)); } -static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, - size_t *size, - bool *is_write) +static __always_inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, + size_t *size, + bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) @@ -78,13 +78,13 @@ static inline bool decode_watchpoint(long watchpoint, /* * Return watchpoint slot for an address. */ -static inline int watchpoint_slot(unsigned long addr) +static __always_inline int watchpoint_slot(unsigned long addr) { return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; } -static inline bool matching_access(unsigned long addr1, size_t size1, - unsigned long addr2, size_t size2) +static __always_inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) { unsigned long end_range1 = addr1 + size1 - 1; unsigned long end_range2 = addr2 + size2 - 1; -- cgit v1.2.3 From 47144eca282189afcf34ef25aee8408c168765d4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Jan 2020 19:48:33 +0100 Subject: kcsan: Show full access type in report This commit adds access-type information to KCSAN's reports as follows: "read", "read (marked)", "write", and "write (marked)". Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 15 ++++++++------- kernel/kcsan/kcsan.h | 2 +- kernel/kcsan/report.c | 43 ++++++++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4d4ab5c5dc53..87bf857c8893 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -255,7 +255,7 @@ static inline unsigned int get_delay(void) static noinline void kcsan_found_watchpoint(const volatile void *ptr, size_t size, - bool is_write, + int type, atomic_long_t *watchpoint, long encoded_watchpoint) { @@ -276,7 +276,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { - kcsan_report(ptr, size, is_write, true, raw_smp_processor_id(), + kcsan_report(ptr, size, type, true, raw_smp_processor_id(), KCSAN_REPORT_CONSUMED_WATCHPOINT); } else { /* @@ -292,8 +292,9 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, } static noinline void -kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) { + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; atomic_long_t *watchpoint; union { u8 _1; @@ -415,13 +416,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) * No need to increment 'data_races' counter, as the racing * thread already did. */ - kcsan_report(ptr, size, is_write, size > 8 || value_change, + kcsan_report(ptr, size, type, size > 8 || value_change, smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); } else if (value_change) { /* Inferring a race, since the value should not have changed. */ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) - kcsan_report(ptr, size, is_write, true, + kcsan_report(ptr, size, type, true, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } @@ -455,10 +456,10 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, */ if (unlikely(watchpoint != NULL)) - kcsan_found_watchpoint(ptr, size, is_write, watchpoint, + kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); else if (unlikely(should_watch(ptr, type))) - kcsan_setup_watchpoint(ptr, size, is_write); + kcsan_setup_watchpoint(ptr, size, type); } /* === Public interface ===================================================== */ diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index d3b9a96ac8a4..8492da45494b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -103,7 +103,7 @@ enum kcsan_report_type { /* * Print a race report from thread that encountered the race. */ -extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, +extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 0eea05a3135b..9f503ca2ff7a 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -24,7 +24,7 @@ static struct { const volatile void *ptr; size_t size; - bool is_write; + int access_type; int task_pid; int cpu_id; unsigned long stack_entries[NUM_STACK_ENTRIES]; @@ -41,8 +41,10 @@ static DEFINE_SPINLOCK(report_lock); * Special rules to skip reporting. */ static bool -skip_report(bool is_write, bool value_change, unsigned long top_frame) +skip_report(int access_type, bool value_change, unsigned long top_frame) { + const bool is_write = (access_type & KCSAN_ACCESS_WRITE) != 0; + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && !value_change) { /* @@ -63,9 +65,20 @@ skip_report(bool is_write, bool value_change, unsigned long top_frame) return kcsan_skip_report_debugfs(top_frame); } -static inline const char *get_access_type(bool is_write) +static const char *get_access_type(int type) { - return is_write ? "write" : "read"; + switch (type) { + case 0: + return "read"; + case KCSAN_ACCESS_ATOMIC: + return "read (marked)"; + case KCSAN_ACCESS_WRITE: + return "write"; + case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked)"; + default: + BUG(); + } } /* Return thread description: in task or interrupt. */ @@ -112,7 +125,7 @@ static int sym_strcmp(void *addr1, void *addr2) /* * Returns true if a report was generated, false otherwise. */ -static bool print_report(const volatile void *ptr, size_t size, bool is_write, +static bool print_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type) { @@ -124,7 +137,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, /* * Must check report filter rules before starting to print. */ - if (skip_report(is_write, true, stack_entries[skipnr])) + if (skip_report(access_type, true, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -132,7 +145,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, other_info.num_stack_entries); /* @value_change is only known for the other thread */ - if (skip_report(other_info.is_write, value_change, + if (skip_report(other_info.access_type, value_change, other_info.stack_entries[other_skipnr])) return false; } @@ -170,7 +183,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, switch (type) { case KCSAN_REPORT_RACE_SIGNAL: pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(other_info.is_write), other_info.ptr, + get_access_type(other_info.access_type), other_info.ptr, other_info.size, get_thread_desc(other_info.task_pid), other_info.cpu_id); @@ -181,14 +194,14 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(is_write), ptr, size, + get_access_type(access_type), ptr, size, get_thread_desc(in_task() ? task_pid_nr(current) : -1), cpu_id); break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(is_write), ptr, size, + get_access_type(access_type), ptr, size, get_thread_desc(in_task() ? task_pid_nr(current) : -1), cpu_id); break; @@ -223,7 +236,7 @@ static void release_report(unsigned long *flags, enum kcsan_report_type type) * required for the report type, simply acquires report_lock and returns true. */ static bool prepare_report(unsigned long *flags, const volatile void *ptr, - size_t size, bool is_write, int cpu_id, + size_t size, int access_type, int cpu_id, enum kcsan_report_type type) { if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && @@ -243,7 +256,7 @@ retry: other_info.ptr = ptr; other_info.size = size; - other_info.is_write = is_write; + other_info.access_type = access_type; other_info.task_pid = in_task() ? task_pid_nr(current) : -1; other_info.cpu_id = cpu_id; other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); @@ -302,14 +315,14 @@ retry: goto retry; } -void kcsan_report(const volatile void *ptr, size_t size, bool is_write, +void kcsan_report(const volatile void *ptr, size_t size, int access_type, bool value_change, int cpu_id, enum kcsan_report_type type) { unsigned long flags = 0; kcsan_disable_current(); - if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { - if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) + if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { + if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); -- cgit v1.2.3 From 05f9a4067964e3f864210271a6299f13d2eeea55 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Jan 2020 19:48:34 +0100 Subject: kcsan: Rate-limit reporting per data races KCSAN data-race reports can occur quite frequently, so much so as to render the system useless. This commit therefore adds support for time-based rate-limiting KCSAN reports, with the time interval specified by a new KCSAN_REPORT_ONCE_IN_MS Kconfig option. The default is 3000 milliseconds, also known as three seconds. Because KCSAN must detect data races in allocators and in other contexts where use of allocation is ill-advised, a fixed-size array is used to buffer reports during each reporting interval. To reduce the number of reports lost due to array overflow, this commit stores only one instance of duplicate reports, which has the benefit of further reducing KCSAN's console output rate. Reported-by: Qian Cai Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 110 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9f503ca2ff7a..b5b4feea49de 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -31,12 +32,99 @@ static struct { int num_stack_entries; } other_info = { .ptr = NULL }; +/* + * Information about reported data races; used to rate limit reporting. + */ +struct report_time { + /* + * The last time the data race was reported. + */ + unsigned long time; + + /* + * The frames of the 2 threads; if only 1 thread is known, one frame + * will be 0. + */ + unsigned long frame1; + unsigned long frame2; +}; + +/* + * Since we also want to be able to debug allocators with KCSAN, to avoid + * deadlock, report_times cannot be dynamically resized with krealloc in + * rate_limit_report. + * + * Therefore, we use a fixed-size array, which at most will occupy a page. This + * still adequately rate limits reports, assuming that a) number of unique data + * races is not excessive, and b) occurrence of unique data races within the + * same time window is limited. + */ +#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) +#define REPORT_TIMES_SIZE \ + (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \ + REPORT_TIMES_MAX : \ + CONFIG_KCSAN_REPORT_ONCE_IN_MS) +static struct report_time report_times[REPORT_TIMES_SIZE]; + /* * This spinlock protects reporting and other_info, since other_info is usually * required when reporting. */ static DEFINE_SPINLOCK(report_lock); +/* + * Checks if the data race identified by thread frames frame1 and frame2 has + * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). + */ +static bool rate_limit_report(unsigned long frame1, unsigned long frame2) +{ + struct report_time *use_entry = &report_times[0]; + unsigned long invalid_before; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0); + + if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0) + return false; + + invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); + + /* Check if a matching data race report exists. */ + for (i = 0; i < REPORT_TIMES_SIZE; ++i) { + struct report_time *rt = &report_times[i]; + + /* + * Must always select an entry for use to store info as we + * cannot resize report_times; at the end of the scan, use_entry + * will be the oldest entry, which ideally also happened before + * KCSAN_REPORT_ONCE_IN_MS ago. + */ + if (time_before(rt->time, use_entry->time)) + use_entry = rt; + + /* + * Initially, no need to check any further as this entry as well + * as following entries have never been used. + */ + if (rt->time == 0) + break; + + /* Check if entry expired. */ + if (time_before(rt->time, invalid_before)) + continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ + + /* Reported recently, check if data race matches. */ + if ((rt->frame1 == frame1 && rt->frame2 == frame2) || + (rt->frame1 == frame2 && rt->frame2 == frame1)) + return true; + } + + use_entry->time = jiffies; + use_entry->frame1 = frame1; + use_entry->frame2 = frame2; + return false; +} + /* * Special rules to skip reporting. */ @@ -132,7 +220,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); - int other_skipnr; + unsigned long this_frame = stack_entries[skipnr]; + unsigned long other_frame = 0; + int other_skipnr = 0; /* silence uninit warnings */ /* * Must check report filter rules before starting to print. @@ -143,34 +233,34 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, if (type == KCSAN_REPORT_RACE_SIGNAL) { other_skipnr = get_stack_skipnr(other_info.stack_entries, other_info.num_stack_entries); + other_frame = other_info.stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ - if (skip_report(other_info.access_type, value_change, - other_info.stack_entries[other_skipnr])) + if (skip_report(other_info.access_type, value_change, other_frame)) return false; } + if (rate_limit_report(this_frame, other_frame)) + return false; + /* Print report header. */ pr_err("==================================================================\n"); switch (type) { case KCSAN_REPORT_RACE_SIGNAL: { - void *this_fn = (void *)stack_entries[skipnr]; - void *other_fn = (void *)other_info.stack_entries[other_skipnr]; int cmp; /* * Order functions lexographically for consistent bug titles. * Do not print offset of functions to keep title short. */ - cmp = sym_strcmp(other_fn, this_fn); + cmp = sym_strcmp((void *)other_frame, (void *)this_frame); pr_err("BUG: KCSAN: data-race in %ps / %ps\n", - cmp < 0 ? other_fn : this_fn, - cmp < 0 ? this_fn : other_fn); + (void *)(cmp < 0 ? other_frame : this_frame), + (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: data-race in %pS\n", - (void *)stack_entries[skipnr]); + pr_err("BUG: KCSAN: data-race in %pS\n", (void *)this_frame); break; default: -- cgit v1.2.3 From f1bc96210c6a6b853b4b2eec808141956e8fbc5d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 15 Jan 2020 17:25:12 +0100 Subject: kcsan: Make KCSAN compatible with lockdep We must avoid any recursion into lockdep if KCSAN is enabled on utilities used by lockdep. One manifestation of this is corruption of lockdep's IRQ trace state (if TRACE_IRQFLAGS), resulting in spurious warnings (see below). This commit fixes this by: 1. Using raw_local_irq{save,restore} in kcsan_setup_watchpoint(). 2. Disabling lockdep in kcsan_report(). Tested with: CONFIG_LOCKDEP=y CONFIG_DEBUG_LOCKDEP=y CONFIG_TRACE_IRQFLAGS=y This fix eliminates spurious warnings such as the following one: WARNING: CPU: 0 PID: 2 at kernel/locking/lockdep.c:4406 check_flags.part.0+0x101/0x220 Modules linked in: CPU: 0 PID: 2 Comm: kthreadd Not tainted 5.5.0-rc1+ #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:check_flags.part.0+0x101/0x220 Call Trace: lock_is_held_type+0x69/0x150 freezer_fork+0x20b/0x370 cgroup_post_fork+0x2c9/0x5c0 copy_process+0x2675/0x3b40 _do_fork+0xbe/0xa30 ? _raw_spin_unlock_irqrestore+0x40/0x50 ? match_held_lock+0x56/0x250 ? kthread_park+0xf0/0xf0 kernel_thread+0xa6/0xd0 ? kthread_park+0xf0/0xf0 kthreadd+0x321/0x3d0 ? kthread_create_on_cpu+0x130/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 64 hardirqs last enabled at (63): [] _raw_spin_unlock_irqrestore+0x40/0x50 hardirqs last disabled at (64): [] kcsan_setup_watchpoint+0x92/0x460 softirqs last enabled at (32): [] fpu__copy+0xe8/0x470 softirqs last disabled at (30): [] fpu__copy+0x69/0x470 Reported-by: Qian Cai Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Tested-by: Qian Cai Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 6 ++++-- kernel/kcsan/report.c | 11 +++++++++++ kernel/locking/Makefile | 3 +++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 87bf857c8893..64b30f7716a1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -336,8 +336,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * CPU-local data accesses), it makes more sense (from a data race * detection point of view) to simply disable preemptions to ensure * as many tasks as possible run on other CPUs. + * + * Use raw versions, to avoid lockdep recursion via IRQ flags tracing. */ - local_irq_save(irq_flags); + raw_local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -429,7 +431,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: - local_irq_restore(irq_flags); + raw_local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index b5b4feea49de..33bdf8b229b5 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -410,6 +411,14 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, { unsigned long flags = 0; + /* + * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if + * we do not turn off lockdep here; this could happen due to recursion + * into lockdep via KCSAN if we detect a data race in utilities used by + * lockdep. + */ + lockdep_off(); + kcsan_disable_current(); if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) @@ -418,4 +427,6 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, release_report(&flags, type); } kcsan_enable_current(); + + lockdep_on(); } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 45452facff3b..6d11cfb9b41f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,6 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +KCSAN_SANITIZE_lockdep.o := n + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From ad4f8eeca8eaa24afb6059c241a2f4baf86378f3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jan 2020 16:01:02 +0100 Subject: kcsan: Address missing case with KCSAN_REPORT_VALUE_CHANGE_ONLY Even with KCSAN_REPORT_VALUE_CHANGE_ONLY, KCSAN still reports data races between reads and watchpointed writes, even if the writes wrote values already present. This commit causes KCSAN to unconditionally skip reporting in this case. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 33bdf8b229b5..7cd34285df74 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -130,12 +130,25 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) * Special rules to skip reporting. */ static bool -skip_report(int access_type, bool value_change, unsigned long top_frame) +skip_report(bool value_change, unsigned long top_frame) { - const bool is_write = (access_type & KCSAN_ACCESS_WRITE) != 0; - - if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && - !value_change) { + /* + * The first call to skip_report always has value_change==true, since we + * cannot know the value written of an instrumented access. For the 2nd + * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY: + * + * 1. read watchpoint, conflicting write (value_change==true): report; + * 2. read watchpoint, conflicting write (value_change==false): skip; + * 3. write watchpoint, conflicting write (value_change==true): report; + * 4. write watchpoint, conflicting write (value_change==false): skip; + * 5. write watchpoint, conflicting read (value_change==false): skip; + * 6. write watchpoint, conflicting read (value_change==true): impossible; + * + * Cases 1-4 are intuitive and expected; case 5 ensures we do not report + * data races where the write may have rewritten the same value; and + * case 6 is simply impossible. + */ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { /* * The access is a write, but the data value did not change. * @@ -228,7 +241,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, /* * Must check report filter rules before starting to print. */ - if (skip_report(access_type, true, stack_entries[skipnr])) + if (skip_report(true, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -237,7 +250,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, other_frame = other_info.stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ - if (skip_report(other_info.access_type, value_change, other_frame)) + if (skip_report(value_change, other_frame)) return false; } -- cgit v1.2.3 From 1e6ee2f0fe8ae682757960edf455e99f611268a0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 Feb 2020 18:21:10 +0100 Subject: kcsan: Add option to assume plain aligned writes up to word size are atomic This adds option KCSAN_ASSUME_PLAIN_WRITES_ATOMIC. If enabled, plain aligned writes up to word size are assumed to be atomic, and also not subject to other unsafe compiler optimizations resulting in data races. This option has been enabled by default to reflect current kernel-wide preferences. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 64b30f7716a1..e3c7d8f34f2f 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -169,10 +170,20 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } -static __always_inline bool is_atomic(const volatile void *ptr) +static __always_inline bool +is_atomic(const volatile void *ptr, size_t size, int type) { - struct kcsan_ctx *ctx = get_ctx(); + struct kcsan_ctx *ctx; + + if ((type & KCSAN_ACCESS_ATOMIC) != 0) + return true; + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && + (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && + IS_ALIGNED((unsigned long)ptr, size)) + return true; /* Assume aligned writes up to word size are atomic. */ + + ctx = get_ctx(); if (unlikely(ctx->atomic_next > 0)) { /* * Because we do not have separate contexts for nested @@ -193,7 +204,8 @@ static __always_inline bool is_atomic(const volatile void *ptr) return kcsan_is_atomic(ptr); } -static __always_inline bool should_watch(const volatile void *ptr, int type) +static __always_inline bool +should_watch(const volatile void *ptr, size_t size, int type) { /* * Never set up watchpoints when memory operations are atomic. @@ -202,7 +214,7 @@ static __always_inline bool should_watch(const volatile void *ptr, int type) * should not count towards skipped instructions, and (2) to actually * decrement kcsan_atomic_next for consecutive instruction stream. */ - if ((type & KCSAN_ACCESS_ATOMIC) != 0 || is_atomic(ptr)) + if (is_atomic(ptr, size, type)) return false; if (this_cpu_dec_return(kcsan_skip) >= 0) @@ -460,7 +472,7 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, if (unlikely(watchpoint != NULL)) kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); - else if (unlikely(should_watch(ptr, type))) + else if (unlikely(should_watch(ptr, size, type))) kcsan_setup_watchpoint(ptr, size, type); } -- cgit v1.2.3 From ed95f95c86cd53621103d865d62b5e1f96e60edb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 5 Feb 2020 11:14:19 +0100 Subject: kcsan: Fix 0-sized checks Instrumentation of arbitrary memory-copy functions, such as user-copies, may be called with size of 0, which could lead to false positives. To avoid this, add a comparison in check_access() for size==0, which will be optimized out for constant sized instrumentation (__tsan_{read,write}N), and therefore not affect the common-case fast-path. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 7 +++++++ kernel/kcsan/test.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index e3c7d8f34f2f..82c2bef827d4 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -455,6 +455,13 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, atomic_long_t *watchpoint; long encoded_watchpoint; + /* + * Do nothing for 0 sized check; this comparison will be optimized out + * for constant sized instrumentation (__tsan_{read,write}N). + */ + if (unlikely(size == 0)) + return; + /* * Avoid user_access_save in fast-path: find_watchpoint is safe without * user_access_save, as the address that ptr points to is only used to diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c index cc6000239dc0..d26a052d3383 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/test.c @@ -92,6 +92,16 @@ static bool test_matching_access(void) return false; if (WARN_ON(matching_access(9, 1, 10, 1))) return false; + + /* + * An access of size 0 could match another access, as demonstrated here. + * Rather than add more comparisons to 'matching_access()', which would + * end up in the fast-path for *all* checks, check_access() simply + * returns for all accesses of size 0. + */ + if (WARN_ON(!matching_access(8, 8, 12, 0))) + return false; + return true; } -- cgit v1.2.3 From d591ec3db75f9eadfa7976ff8796c674c0027715 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:24 +0100 Subject: kcsan: Introduce KCSAN_ACCESS_ASSERT access type The KCSAN_ACCESS_ASSERT access type may be used to introduce dummy reads and writes to assert certain properties of concurrent code, where bugs could not be detected as normal data races. For example, a variable that is only meant to be written by a single CPU, but may be read (without locking) by other CPUs must still be marked properly to avoid data races. However, concurrent writes, regardless if WRITE_ONCE() or not, would be a bug. Using kcsan_check_access(&x, sizeof(x), KCSAN_ACCESS_ASSERT) would allow catching such bugs. To support KCSAN_ACCESS_ASSERT the following notable changes were made: * If an access is of type KCSAN_ASSERT_ACCESS, disable various filters that only apply to data races, so that all races that KCSAN observes are reported. * Bug reports that involve an ASSERT access type will be reported as "KCSAN: assert: race in ..." instead of "data-race"; this will help more easily distinguish them. * Update a few comments to just mention 'races' where we do not always mean pure data races. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 44 ++++++++++++++++++++++++++++++++++++++------ kernel/kcsan/debugfs.c | 1 + kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 43 +++++++++++++++++++++++++++++++------------ 4 files changed, 77 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 82c2bef827d4..87ef01e40199 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -56,7 +56,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { /* * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary - * slot (middle) is fine if we assume that data races occur rarely. The set of + * slot (middle) is fine if we assume that races occur rarely. The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. */ @@ -178,6 +178,14 @@ is_atomic(const volatile void *ptr, size_t size, int type) if ((type & KCSAN_ACCESS_ATOMIC) != 0) return true; + /* + * Unless explicitly declared atomic, never consider an assertion access + * as atomic. This allows using them also in atomic regions, such as + * seqlocks, without implicitly changing their semantics. + */ + if ((type & KCSAN_ACCESS_ASSERT) != 0) + return false; + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && IS_ALIGNED((unsigned long)ptr, size)) @@ -298,7 +306,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, */ kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); } - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + if ((type & KCSAN_ACCESS_ASSERT) != 0) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + else + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); user_access_restore(flags); } @@ -307,6 +319,7 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) { const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; union { u8 _1; @@ -429,13 +442,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) /* * No need to increment 'data_races' counter, as the racing * thread already did. + * + * Count 'assert_failures' for each failed ASSERT access, + * therefore both this thread and the racing thread may + * increment this counter. */ - kcsan_report(ptr, size, type, size > 8 || value_change, - smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + /* + * - If we were not able to observe a value change due to size + * constraints, always assume a value change. + * - If the access type is an assertion, we also always assume a + * value change to always report the race. + */ + value_change = value_change || size > 8 || is_assert; + + kcsan_report(ptr, size, type, value_change, smp_processor_id(), + KCSAN_REPORT_RACE_SIGNAL); } else if (value_change) { /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); - if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, true, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); @@ -471,7 +503,7 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the - * slow-path, as long as no state changes that cause a data race to be + * slow-path, as long as no state changes that cause a race to be * detected and reported have occurred until kcsan_is_enabled() is * checked. */ diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index bec42dab32ee..a9dad44130e6 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -44,6 +44,7 @@ static const char *counter_to_name(enum kcsan_counter_id id) case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; case KCSAN_COUNTER_REPORT_RACES: return "report_races"; case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 8492da45494b..50078e7d43c3 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -39,6 +39,13 @@ enum kcsan_counter_id { */ KCSAN_COUNTER_DATA_RACES, + /* + * Total number of ASSERT failures due to races. If the observed race is + * due to two conflicting ASSERT type accesses, then both will be + * counted. + */ + KCSAN_COUNTER_ASSERT_FAILURES, + /* * Number of times no watchpoints were available. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 7cd34285df74..3bc590e6be7e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -34,11 +34,11 @@ static struct { } other_info = { .ptr = NULL }; /* - * Information about reported data races; used to rate limit reporting. + * Information about reported races; used to rate limit reporting. */ struct report_time { /* - * The last time the data race was reported. + * The last time the race was reported. */ unsigned long time; @@ -57,7 +57,7 @@ struct report_time { * * Therefore, we use a fixed-size array, which at most will occupy a page. This * still adequately rate limits reports, assuming that a) number of unique data - * races is not excessive, and b) occurrence of unique data races within the + * races is not excessive, and b) occurrence of unique races within the * same time window is limited. */ #define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) @@ -74,7 +74,7 @@ static struct report_time report_times[REPORT_TIMES_SIZE]; static DEFINE_SPINLOCK(report_lock); /* - * Checks if the data race identified by thread frames frame1 and frame2 has + * Checks if the race identified by thread frames frame1 and frame2 has * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). */ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) @@ -90,7 +90,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); - /* Check if a matching data race report exists. */ + /* Check if a matching race report exists. */ for (i = 0; i < REPORT_TIMES_SIZE; ++i) { struct report_time *rt = &report_times[i]; @@ -114,7 +114,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) if (time_before(rt->time, invalid_before)) continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ - /* Reported recently, check if data race matches. */ + /* Reported recently, check if race matches. */ if ((rt->frame1 == frame1 && rt->frame2 == frame2) || (rt->frame1 == frame2 && rt->frame2 == frame1)) return true; @@ -142,11 +142,12 @@ skip_report(bool value_change, unsigned long top_frame) * 3. write watchpoint, conflicting write (value_change==true): report; * 4. write watchpoint, conflicting write (value_change==false): skip; * 5. write watchpoint, conflicting read (value_change==false): skip; - * 6. write watchpoint, conflicting read (value_change==true): impossible; + * 6. write watchpoint, conflicting read (value_change==true): report; * * Cases 1-4 are intuitive and expected; case 5 ensures we do not report - * data races where the write may have rewritten the same value; and - * case 6 is simply impossible. + * data races where the write may have rewritten the same value; case 6 + * is possible either if the size is larger than what we check value + * changes for or the access type is KCSAN_ACCESS_ASSERT. */ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { /* @@ -178,11 +179,27 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + + /* + * ASSERT variants: + */ + case KCSAN_ACCESS_ASSERT: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: + return "assert no writes"; + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "assert no accesses"; + default: BUG(); } } +static const char *get_bug_type(int type) +{ + return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race"; +} + /* Return thread description: in task or interrupt. */ static const char *get_thread_desc(int task_id) { @@ -268,13 +285,15 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, * Do not print offset of functions to keep title short. */ cmp = sym_strcmp((void *)other_frame, (void *)this_frame); - pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + pr_err("BUG: KCSAN: %s in %ps / %ps\n", + get_bug_type(access_type | other_info.access_type), (void *)(cmp < 0 ? other_frame : this_frame), (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: data-race in %pS\n", (void *)this_frame); + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(access_type), + (void *)this_frame); break; default: @@ -427,7 +446,7 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a data race in utilities used by + * into lockdep via KCSAN if we detect a race in utilities used by * lockdep. */ lockdep_off(); -- cgit v1.2.3 From a312013578e4775003689e31c1f487df11f362a3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:26 +0100 Subject: kcsan: Add test to generate conflicts via debugfs Add 'test=' option to KCSAN's debugfs interface to invoke KCSAN checks on a dummy variable. By writing 'test=' to the debugfs file from multiple tasks, we can generate real conflicts, and trigger data race reports. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/debugfs.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index a9dad44130e6..9bbba0e57c9b 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -69,9 +70,9 @@ void kcsan_counter_dec(enum kcsan_counter_id id) /* * The microbenchmark allows benchmarking KCSAN core runtime only. To run * multiple threads, pipe 'microbench=' from multiple tasks into the - * debugfs file. + * debugfs file. This will not generate any conflicts, and tests fast-path only. */ -static void microbenchmark(unsigned long iters) +static noinline void microbenchmark(unsigned long iters) { cycles_t cycles; @@ -81,18 +82,52 @@ static void microbenchmark(unsigned long iters) while (iters--) { /* * We can run this benchmark from multiple tasks; this address - * calculation increases likelyhood of some accesses overlapping - * (they still won't conflict because all are reads). + * calculation increases likelyhood of some accesses + * overlapping. Make the access type an atomic read, to never + * set up watchpoints and test the fast-path only. */ unsigned long addr = iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); - __kcsan_check_read((void *)addr, sizeof(long)); + __kcsan_check_access((void *)addr, sizeof(long), KCSAN_ACCESS_ATOMIC); } cycles = get_cycles() - cycles; pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); } +/* + * Simple test to create conflicting accesses. Write 'test=' to KCSAN's + * debugfs file from multiple tasks to generate real conflicts and show reports. + */ +static long test_dummy; +static noinline void test_thread(unsigned long iters) +{ + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + cycles_t cycles; + + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + __kcsan_check_read(&test_dummy, sizeof(test_dummy)); + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + ASSERT_EXCLUSIVE_WRITER(test_dummy); + ASSERT_EXCLUSIVE_ACCESS(test_dummy); + + /* not actually instrumented */ + WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + /* restore context */ + current->kcsan_ctx = ctx_save; +} + static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -242,6 +277,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) return -EINVAL; microbenchmark(iters); + } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) + return -EINVAL; + test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { -- cgit v1.2.3 From 80d4c4775216602ccdc9e761ce251c8451d0c6ca Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 7 Feb 2020 19:59:10 +0100 Subject: kcsan: Expose core configuration parameters as module params This adds early_boot, udelay_{task,interrupt}, and skip_watch as module params. The latter parameters are useful to modify at runtime to tune KCSAN's performance on new systems. This will also permit auto-tuning these parameters to maximize overall system performance and KCSAN's race detection ability. None of the parameters are used in the fast-path and referring to them via static variables instead of CONFIG constants will not affect performance. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Cc: Qian Cai --- kernel/kcsan/core.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 87ef01e40199..498b1eb3c1cd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,20 @@ #include "encoding.h" #include "kcsan.h" +static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); +static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; +static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; +static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kcsan." +module_param_named(early_enable, kcsan_early_enable, bool, 0); +module_param_named(udelay_task, kcsan_udelay_task, uint, 0644); +module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); +module_param_named(skip_watch, kcsan_skip_watch, long, 0644); + bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ @@ -239,9 +254,9 @@ should_watch(const volatile void *ptr, size_t size, int type) static inline void reset_kcsan_skip(void) { - long skip_count = CONFIG_KCSAN_SKIP_WATCH - + long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(CONFIG_KCSAN_SKIP_WATCH) : + prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -253,8 +268,7 @@ static __always_inline bool kcsan_is_enabled(void) static inline unsigned int get_delay(void) { - unsigned int delay = in_task() ? CONFIG_KCSAN_UDELAY_TASK : - CONFIG_KCSAN_UDELAY_INTERRUPT; + unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? prandom_u32_max(delay) : 0); @@ -527,7 +541,7 @@ void __init kcsan_init(void) * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE)) + if (kcsan_early_enable) WRITE_ONCE(kcsan_enabled, true); } -- cgit v1.2.3 From 3a5b45e5031fa395ab6e53545fb726b2d5b104f0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 10 Feb 2020 15:56:39 +0100 Subject: kcsan: Fix misreporting if concurrent races on same address If there are at least 4 threads racing on the same address, it can happen that one of the readers may observe another matching reader in other_info. To avoid locking up, we have to consume 'other_info' regardless, but skip the report. See the added comment for more details. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/report.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 3bc590e6be7e..abf6852dff72 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -422,6 +422,44 @@ retry: return false; } + access_type |= other_info.access_type; + if ((access_type & KCSAN_ACCESS_WRITE) == 0) { + /* + * While the address matches, this is not the other_info + * from the thread that consumed our watchpoint, since + * neither this nor the access in other_info is a write. + * It is invalid to continue with the report, since we + * only have information about reads. + * + * This can happen due to concurrent races on the same + * address, with at least 4 threads. To avoid locking up + * other_info and all other threads, we have to consume + * it regardless. + * + * A concrete case to illustrate why we might lock up if + * we do not consume other_info: + * + * We have 4 threads, all accessing the same address + * (or matching address ranges). Assume the following + * watcher and watchpoint consumer pairs: + * write1-read1, read2-write2. The first to populate + * other_info is write2, however, write1 consumes it, + * resulting in a report of write1-write2. This report + * is valid, however, now read1 populates other_info; + * read2-read1 is an invalid conflict, yet, no other + * conflicting access is left. Therefore, we must + * consume read1's other_info. + * + * Since this case is assumed to be rare, it is + * reasonable to omit this report: one of the other + * reports includes information about the same shared + * data, and at this point the likelihood that we + * re-report the same race again is high. + */ + release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + return false; + } + /* * Matching & usable access in other_info: keep other_info_lock * locked, as this thread consumes it to print the full report; -- cgit v1.2.3 From b738f6169f1260b4ed5bd9f220b1c84d79f3ab8d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:21 +0100 Subject: kcsan: Introduce kcsan_value_change type Introduces kcsan_value_change type, which explicitly points out if we either observed a value-change (TRUE), or we could not observe one but cannot rule out a value-change happened (MAYBE). The MAYBE state can either be reported or not, depending on configuration preferences. A follow-up patch introduces the FALSE state, which should never be reported. No functional change intended. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 38 ++++++++++++++++++++++---------------- kernel/kcsan/kcsan.h | 19 ++++++++++++++++++- kernel/kcsan/report.c | 26 ++++++++++++++------------ 3 files changed, 54 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 498b1eb3c1cd..3f89801161d3 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -341,7 +341,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) u32 _4; u64 _8; } expect_value; - bool value_change = false; + enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; @@ -398,6 +398,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. */ + expect_value._8 = 0; switch (size) { case 1: expect_value._1 = READ_ONCE(*(const u8 *)ptr); @@ -436,23 +437,36 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) */ switch (size) { case 1: - value_change = expect_value._1 != READ_ONCE(*(const u8 *)ptr); + expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); break; case 2: - value_change = expect_value._2 != READ_ONCE(*(const u16 *)ptr); + expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); break; case 4: - value_change = expect_value._4 != READ_ONCE(*(const u32 *)ptr); + expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); break; case 8: - value_change = expect_value._8 != READ_ONCE(*(const u64 *)ptr); + expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); break; default: break; /* ignore; we do not diff the values */ } + /* Were we able to observe a value-change? */ + if (expect_value._8 != 0) + value_change = KCSAN_VALUE_CHANGE_TRUE; + /* Check if this access raced with another. */ if (!remove_watchpoint(watchpoint)) { + /* + * Depending on the access type, map a value_change of MAYBE to + * TRUE (require reporting). + */ + if (value_change == KCSAN_VALUE_CHANGE_MAYBE && (size > 8 || is_assert)) { + /* Always assume a value-change. */ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } + /* * No need to increment 'data_races' counter, as the racing * thread already did. @@ -461,20 +475,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * therefore both this thread and the racing thread may * increment this counter. */ - if (is_assert) + if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - /* - * - If we were not able to observe a value change due to size - * constraints, always assume a value change. - * - If the access type is an assertion, we also always assume a - * value change to always report the race. - */ - value_change = value_change || size > 8 || is_assert; - kcsan_report(ptr, size, type, value_change, smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); - } else if (value_change) { + } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); @@ -482,7 +488,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) - kcsan_report(ptr, size, type, true, + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 50078e7d43c3..83a79b08b550 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -88,6 +88,22 @@ extern void kcsan_counter_dec(enum kcsan_counter_id id); */ extern bool kcsan_skip_report_debugfs(unsigned long func_addr); +/* + * Value-change states. + */ +enum kcsan_value_change { + /* + * Did not observe a value-change, however, it is valid to report the + * race, depending on preferences. + */ + KCSAN_VALUE_CHANGE_MAYBE, + + /* + * The value was observed to change, and the race should be reported. + */ + KCSAN_VALUE_CHANGE_TRUE, +}; + enum kcsan_report_type { /* * The thread that set up the watchpoint and briefly stalled was @@ -111,6 +127,7 @@ enum kcsan_report_type { * Print a race report from thread that encountered the race. */ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, enum kcsan_report_type type); + enum kcsan_value_change value_change, int cpu_id, + enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index abf6852dff72..d871476dc134 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -130,26 +130,27 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) * Special rules to skip reporting. */ static bool -skip_report(bool value_change, unsigned long top_frame) +skip_report(enum kcsan_value_change value_change, unsigned long top_frame) { /* - * The first call to skip_report always has value_change==true, since we + * The first call to skip_report always has value_change==TRUE, since we * cannot know the value written of an instrumented access. For the 2nd * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY: * - * 1. read watchpoint, conflicting write (value_change==true): report; - * 2. read watchpoint, conflicting write (value_change==false): skip; - * 3. write watchpoint, conflicting write (value_change==true): report; - * 4. write watchpoint, conflicting write (value_change==false): skip; - * 5. write watchpoint, conflicting read (value_change==false): skip; - * 6. write watchpoint, conflicting read (value_change==true): report; + * 1. read watchpoint, conflicting write (value_change==TRUE): report; + * 2. read watchpoint, conflicting write (value_change==MAYBE): skip; + * 3. write watchpoint, conflicting write (value_change==TRUE): report; + * 4. write watchpoint, conflicting write (value_change==MAYBE): skip; + * 5. write watchpoint, conflicting read (value_change==MAYBE): skip; + * 6. write watchpoint, conflicting read (value_change==TRUE): report; * * Cases 1-4 are intuitive and expected; case 5 ensures we do not report * data races where the write may have rewritten the same value; case 6 * is possible either if the size is larger than what we check value * changes for or the access type is KCSAN_ACCESS_ASSERT. */ - if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && + value_change == KCSAN_VALUE_CHANGE_MAYBE) { /* * The access is a write, but the data value did not change. * @@ -245,7 +246,7 @@ static int sym_strcmp(void *addr1, void *addr2) * Returns true if a report was generated, false otherwise. */ static bool print_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, + enum kcsan_value_change value_change, int cpu_id, enum kcsan_report_type type) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; @@ -258,7 +259,7 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, /* * Must check report filter rules before starting to print. */ - if (skip_report(true, stack_entries[skipnr])) + if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr])) return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { @@ -477,7 +478,8 @@ retry: } void kcsan_report(const volatile void *ptr, size_t size, int access_type, - bool value_change, int cpu_id, enum kcsan_report_type type) + enum kcsan_value_change value_change, int cpu_id, + enum kcsan_report_type type) { unsigned long flags = 0; -- cgit v1.2.3 From 81af89e15862909881ff010a0adb67148487e88a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:22 +0100 Subject: kcsan: Add kcsan_set_access_mask() support When setting up an access mask with kcsan_set_access_mask(), KCSAN will only report races if concurrent changes to bits set in access_mask are observed. Conveying access_mask via a separate call avoids introducing overhead in the common-case fast-path. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/core.c | 43 +++++++++++++++++++++++++++++++++++++++---- kernel/kcsan/kcsan.h | 5 +++++ kernel/kcsan/report.c | 13 ++++++++++++- 3 files changed, 56 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3f89801161d3..589b1e7f0f25 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -39,6 +39,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_next = 0, .atomic_nest_count = 0, .in_flat_atomic = false, + .access_mask = 0, }; /* @@ -298,6 +299,15 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (!kcsan_is_enabled()) return; + + /* + * The access_mask check relies on value-change comparison. To avoid + * reporting a race where e.g. the writer set up the watchpoint, but the + * reader has access_mask!=0, we have to ignore the found watchpoint. + */ + if (get_ctx()->access_mask != 0) + return; + /* * Consume the watchpoint as soon as possible, to minimize the chances * of !consumed. Consuming the watchpoint must always be guarded by @@ -341,6 +351,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) u32 _4; u64 _8; } expect_value; + unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; @@ -435,18 +446,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ + access_mask = get_ctx()->access_mask; switch (size) { case 1: expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); + if (access_mask) + expect_value._1 &= (u8)access_mask; break; case 2: expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); + if (access_mask) + expect_value._2 &= (u16)access_mask; break; case 4: expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); + if (access_mask) + expect_value._4 &= (u32)access_mask; break; case 8: expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); + if (access_mask) + expect_value._8 &= (u64)access_mask; break; default: break; /* ignore; we do not diff the values */ @@ -460,11 +480,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (!remove_watchpoint(watchpoint)) { /* * Depending on the access type, map a value_change of MAYBE to - * TRUE (require reporting). + * TRUE (always report) or FALSE (never report). */ - if (value_change == KCSAN_VALUE_CHANGE_MAYBE && (size > 8 || is_assert)) { - /* Always assume a value-change. */ - value_change = KCSAN_VALUE_CHANGE_TRUE; + if (value_change == KCSAN_VALUE_CHANGE_MAYBE) { + if (access_mask != 0) { + /* + * For access with access_mask, we require a + * value-change, as it is likely that races on + * ~access_mask bits are expected. + */ + value_change = KCSAN_VALUE_CHANGE_FALSE; + } else if (size > 8 || is_assert) { + /* Always assume a value-change. */ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } } /* @@ -622,6 +651,12 @@ void kcsan_atomic_next(int n) } EXPORT_SYMBOL(kcsan_atomic_next); +void kcsan_set_access_mask(unsigned long mask) +{ + get_ctx()->access_mask = mask; +} +EXPORT_SYMBOL(kcsan_set_access_mask); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 83a79b08b550..892de5120c1b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -98,6 +98,11 @@ enum kcsan_value_change { */ KCSAN_VALUE_CHANGE_MAYBE, + /* + * Did not observe a value-change, and it is invalid to report the race. + */ + KCSAN_VALUE_CHANGE_FALSE, + /* * The value was observed to change, and the race should be reported. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index d871476dc134..11c791b886f3 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -132,6 +132,9 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) static bool skip_report(enum kcsan_value_change value_change, unsigned long top_frame) { + /* Should never get here if value_change==FALSE. */ + WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE); + /* * The first call to skip_report always has value_change==TRUE, since we * cannot know the value written of an instrumented access. For the 2nd @@ -493,7 +496,15 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { - if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) + /* + * Never report if value_change is FALSE, only if we it is + * either TRUE or MAYBE. In case of MAYBE, further filtering may + * be done once we know the full stack trace in print_report(). + */ + bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && + print_report(ptr, size, access_type, value_change, cpu_id, type); + + if (reported && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); -- cgit v1.2.3 From 703b321501c95c658275fd76775282fe45989641 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:23 +0100 Subject: kcsan: Introduce ASSERT_EXCLUSIVE_BITS(var, mask) This introduces ASSERT_EXCLUSIVE_BITS(var, mask). ASSERT_EXCLUSIVE_BITS(var, mask) will cause KCSAN to assume that the following access is safe w.r.t. data races (however, please see the docbook comment for disclaimer here). For more context on why this was considered necessary, please see: http://lkml.kernel.org/r/1580995070-25139-1-git-send-email-cai@lca.pw In particular, before this patch, data races between reads (that use @mask bits of an access that should not be modified concurrently) and writes (that change ~@mask bits not used by the readers) would have been annotated with "data_race()" (or "READ_ONCE()"). However, doing so would then hide real problems: we would no longer be able to detect harmful races between reads to @mask bits and writes to @mask bits. Therefore, by using ASSERT_EXCLUSIVE_BITS(var, mask), we accomplish: 1. Avoid proliferation of specific macros at the call sites: by including a single mask in the argument list, we can use the same macro in a wide variety of call sites, regardless of how and which bits in a field each call site actually accesses. 2. The existing code does not need to be modified (although READ_ONCE() may still be advisable if we cannot prove that the data race is always safe). 3. We catch bugs where the exclusive bits are modified concurrently. 4. We document properties of the current code. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Cc: David Hildenbrand Cc: Jan Kara Cc: Qian Cai --- kernel/kcsan/debugfs.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 9bbba0e57c9b..2ff196123977 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -100,8 +100,10 @@ static noinline void microbenchmark(unsigned long iters) * debugfs file from multiple tasks to generate real conflicts and show reports. */ static long test_dummy; +static long test_flags; static noinline void test_thread(unsigned long iters) { + const long CHANGE_BITS = 0xff00ff00ff00ff00L; const struct kcsan_ctx ctx_save = current->kcsan_ctx; cycles_t cycles; @@ -109,16 +111,27 @@ static noinline void test_thread(unsigned long iters) memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); cycles = get_cycles(); while (iters--) { + /* These all should generate reports. */ __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); ASSERT_EXCLUSIVE_WRITER(test_dummy); ASSERT_EXCLUSIVE_ACCESS(test_dummy); + ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + + ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + /* not actually instrumented */ WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + + test_flags ^= CHANGE_BITS; /* generate value-change */ + __kcsan_check_write(&test_flags, sizeof(test_flags)); } cycles = get_cycles() - cycles; -- cgit v1.2.3 From f5d2313bd3c540be405c4977a63840cd6d0167b5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Feb 2020 22:10:35 +0100 Subject: kcsan, trace: Make KCSAN compatible with tracing Previously the system would lock up if ftrace was enabled together with KCSAN. This is due to recursion on reporting if the tracer code is instrumented with KCSAN. To avoid this for all types of tracing, disable KCSAN instrumentation for all of kernel/trace. Furthermore, since KCSAN relies on udelay() to introduce delay, we have to disable ftrace for udelay() (currently done for x86) in case KCSAN is used together with lockdep and ftrace. The reason is that it may corrupt lockdep IRQ flags tracing state due to a peculiar case of recursion (details in Makefile comment). Reported-by: Qian Cai Tested-by: Qian Cai Acked-by: Steven Rostedt (VMware) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/kcsan/Makefile | 2 ++ kernel/trace/Makefile | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index df6b7799e492..d4999b38d1be 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -4,6 +4,8 @@ KCOV_INSTRUMENT := n UBSAN_SANITIZE := n CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ $(call cc-option,-fno-stack-protector,) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..6b601d88bf71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) +# Avoid recursion due to instrumentation. +KCSAN_SANITIZE := n + ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) -- cgit v1.2.3 From 48b1fc190a180d971fb69217c88c7247f4f2ca19 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Feb 2020 23:02:09 +0100 Subject: kcsan: Add option to allow watcher interruptions Add option to allow interrupts while a watchpoint is set up. This can be enabled either via CONFIG_KCSAN_INTERRUPT_WATCHER or via the boot parameter 'kcsan.interrupt_watcher=1'. Note that, currently not all safe per-CPU access primitives and patterns are accounted for, which could result in false positives. For example, asm-generic/percpu.h uses plain operations, which by default are instrumented. On interrupts and subsequent accesses to the same variable, KCSAN would currently report a data race with this option. Therefore, this option should currently remain disabled by default, but may be enabled for specific test scenarios. To avoid new warnings, changes all uses of smp_processor_id() to use the raw version (as already done in kcsan_found_watchpoint()). The exact SMP processor id is for informational purposes in the report, and correctness is not affected. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 589b1e7f0f25..e7387fec6679 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -21,6 +21,7 @@ static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; +static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX @@ -30,6 +31,7 @@ module_param_named(early_enable, kcsan_early_enable, bool, 0); module_param_named(udelay_task, kcsan_udelay_task, uint, 0644); module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); module_param_named(skip_watch, kcsan_skip_watch, long, 0644); +module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); bool kcsan_enabled; @@ -354,7 +356,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); - unsigned long irq_flags; + unsigned long irq_flags = 0; /* * Always reset kcsan_skip counter in slow-path to avoid underflow; see @@ -370,26 +372,9 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } - /* - * Disable interrupts & preemptions to avoid another thread on the same - * CPU accessing memory locations for the set up watchpoint; this is to - * avoid reporting races to e.g. CPU-local data. - * - * An alternative would be adding the source CPU to the watchpoint - * encoding, and checking that watchpoint-CPU != this-CPU. There are - * several problems with this: - * 1. we should avoid stealing more bits from the watchpoint encoding - * as it would affect accuracy, as well as increase performance - * overhead in the fast-path; - * 2. if we are preempted, but there *is* a genuine data race, we - * would *not* report it -- since this is the common case (vs. - * CPU-local data accesses), it makes more sense (from a data race - * detection point of view) to simply disable preemptions to ensure - * as many tasks as possible run on other CPUs. - * - * Use raw versions, to avoid lockdep recursion via IRQ flags tracing. - */ - raw_local_irq_save(irq_flags); + if (!kcsan_interrupt_watcher) + /* Use raw to avoid lockdep recursion via IRQ flags tracing. */ + raw_local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -507,7 +492,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, smp_processor_id(), + kcsan_report(ptr, size, type, value_change, raw_smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ @@ -518,13 +503,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - smp_processor_id(), + raw_smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: - raw_local_irq_restore(irq_flags); + if (!kcsan_interrupt_watcher) + raw_local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } -- cgit v1.2.3 From 2402d0eae589a31ee7b1774cb220d84d0f5605b4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Sat, 22 Feb 2020 00:10:27 +0100 Subject: kcsan: Add option for verbose reporting Adds CONFIG_KCSAN_VERBOSE to optionally enable more verbose reports. Currently information about the reporting task's held locks and IRQ trace events are shown, if they are enabled. Signed-off-by: Marco Elver Suggested-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 4 +- kernel/kcsan/kcsan.h | 3 ++ kernel/kcsan/report.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 107 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index e7387fec6679..065615df88ea 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -18,8 +18,8 @@ #include "kcsan.h" static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); -static unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; -static unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; +unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; +unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 892de5120c1b..e282f8b5749e 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -13,6 +13,9 @@ /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 +extern unsigned int kcsan_udelay_task; +extern unsigned int kcsan_udelay_interrupt; + /* * Globally enable and disable KCSAN. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 11c791b886f3..18f9d3bc93a5 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include @@ -31,7 +33,26 @@ static struct { int cpu_id; unsigned long stack_entries[NUM_STACK_ENTRIES]; int num_stack_entries; -} other_info = { .ptr = NULL }; + + /* + * Optionally pass @current. Typically we do not need to pass @current + * via @other_info since just @task_pid is sufficient. Passing @current + * has additional overhead. + * + * To safely pass @current, we must either use get_task_struct/ + * put_task_struct, or stall the thread that populated @other_info. + * + * We cannot rely on get_task_struct/put_task_struct in case + * release_report() races with a task being released, and would have to + * free it in release_report(). This may result in deadlock if we want + * to use KCSAN on the allocators. + * + * Since we also want to reliably print held locks for + * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread + * that populated @other_info until it has been consumed. + */ + struct task_struct *task; +} other_info; /* * Information about reported races; used to rate limit reporting. @@ -245,6 +266,16 @@ static int sym_strcmp(void *addr1, void *addr2) return strncmp(buf1, buf2, sizeof(buf1)); } +static void print_verbose_info(struct task_struct *task) +{ + if (!task) + return; + + pr_err("\n"); + debug_show_held_locks(task); + print_irqtrace_events(task); +} + /* * Returns true if a report was generated, false otherwise. */ @@ -319,6 +350,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, other_info.num_stack_entries - other_skipnr, 0); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(other_info.task); + pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", get_access_type(access_type), ptr, size, @@ -340,6 +374,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(current); + /* Print report footer. */ pr_err("\n"); pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); @@ -357,6 +394,67 @@ static void release_report(unsigned long *flags, enum kcsan_report_type type) spin_unlock_irqrestore(&report_lock, *flags); } +/* + * Sets @other_info.task and awaits consumption of @other_info. + * + * Precondition: report_lock is held. + * Postcondition: report_lock is held. + */ +static void +set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) +{ + /* + * We may be instrumenting a code-path where current->state is already + * something other than TASK_RUNNING. + */ + const bool is_running = current->state == TASK_RUNNING; + /* + * To avoid deadlock in case we are in an interrupt here and this is a + * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a + * timeout to ensure this works in all contexts. + * + * Await approximately the worst case delay of the reporting thread (if + * we are not interrupted). + */ + int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt); + + other_info.task = current; + do { + if (is_running) { + /* + * Let lockdep know the real task is sleeping, to print + * the held locks (recall we turned lockdep off, so + * locking/unlocking @report_lock won't be recorded). + */ + set_current_state(TASK_UNINTERRUPTIBLE); + } + spin_unlock_irqrestore(&report_lock, *flags); + /* + * We cannot call schedule() since we also cannot reliably + * determine if sleeping here is permitted -- see in_atomic(). + */ + + udelay(1); + spin_lock_irqsave(&report_lock, *flags); + if (timeout-- < 0) { + /* + * Abort. Reset other_info.task to NULL, since it + * appears the other thread is still going to consume + * it. It will result in no verbose info printed for + * this task. + */ + other_info.task = NULL; + break; + } + /* + * If @ptr nor @current matches, then our information has been + * consumed and we may continue. If not, retry. + */ + } while (other_info.ptr == ptr && other_info.task == current); + if (is_running) + set_current_state(TASK_RUNNING); +} + /* * Depending on the report type either sets other_info and returns false, or * acquires the matching other_info and returns true. If other_info is not @@ -388,6 +486,9 @@ retry: other_info.cpu_id = cpu_id; other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + set_other_info_task_blocking(flags, ptr); + spin_unlock_irqrestore(&report_lock, *flags); /* -- cgit v1.2.3 From 44656d3dc4f0dc20010d054f27397a4a1469fabf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 25 Feb 2020 15:32:58 +0100 Subject: kcsan: Add current->state to implicitly atomic accesses Add volatile current->state to list of implicitly atomic accesses. This is in preparation to eventually enable KCSAN on kernel/sched (which currently still has KCSAN_SANITIZE := n). Since accesses that match the special check in atomic.h are rare, it makes more sense to move this check to the slow-path, avoiding the additional compare in the fast-path. With the microbenchmark, a speedup of ~6% is measured. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 21 +++++++-------------- kernel/kcsan/core.c | 22 +++++++++++++++------- kernel/kcsan/debugfs.c | 27 ++++++++++++++++++--------- 3 files changed, 40 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index a9c193053491..be9e625227f3 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -4,24 +4,17 @@ #define _KERNEL_KCSAN_ATOMIC_H #include +#include /* - * Helper that returns true if access to @ptr should be considered an atomic - * access, even though it is not explicitly atomic. - * - * List all volatile globals that have been observed in races, to suppress - * data race reports between accesses to these variables. - * - * For now, we assume that volatile accesses of globals are as strong as atomic - * accesses (READ_ONCE, WRITE_ONCE cast to volatile). The situation is still not - * entirely clear, as on some architectures (Alpha) READ_ONCE/WRITE_ONCE do more - * than cast to volatile. Eventually, we hope to be able to remove this - * function. + * Special rules for certain memory where concurrent conflicting accesses are + * common, however, the current convention is to not mark them; returns true if + * access to @ptr should be considered atomic. Called from slow-path. */ -static __always_inline bool kcsan_is_atomic(const volatile void *ptr) +static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* only jiffies for now */ - return ptr == &jiffies; + /* volatile globals that have been observed in data races. */ + return ptr == &jiffies || ptr == ¤t->state; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 065615df88ea..eb30ecdc8c00 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -188,12 +188,13 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } +/* Rules for generic atomic accesses. Called from fast-path. */ static __always_inline bool is_atomic(const volatile void *ptr, size_t size, int type) { struct kcsan_ctx *ctx; - if ((type & KCSAN_ACCESS_ATOMIC) != 0) + if (type & KCSAN_ACCESS_ATOMIC) return true; /* @@ -201,16 +202,16 @@ is_atomic(const volatile void *ptr, size_t size, int type) * as atomic. This allows using them also in atomic regions, such as * seqlocks, without implicitly changing their semantics. */ - if ((type & KCSAN_ACCESS_ASSERT) != 0) + if (type & KCSAN_ACCESS_ASSERT) return false; if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && - (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && + (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ ctx = get_ctx(); - if (unlikely(ctx->atomic_next > 0)) { + if (ctx->atomic_next > 0) { /* * Because we do not have separate contexts for nested * interrupts, in case atomic_next is set, we simply assume that @@ -224,10 +225,8 @@ is_atomic(const volatile void *ptr, size_t size, int type) --ctx->atomic_next; /* in task, or outer interrupt */ return true; } - if (unlikely(ctx->atomic_nest_count > 0 || ctx->in_flat_atomic)) - return true; - return kcsan_is_atomic(ptr); + return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic; } static __always_inline bool @@ -367,6 +366,15 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (!kcsan_is_enabled()) goto out; + /* + * Special atomic rules: unlikely to be true, so we check them here in + * the slow-path, and not in the fast-path in is_atomic(). Call after + * kcsan_is_enabled(), as we may access memory that is not yet + * initialized during early boot. + */ + if (!is_assert && kcsan_is_atomic_special(ptr)) + goto out; + if (!check_encodable((unsigned long)ptr, size)) { kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); goto out; diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 2ff196123977..72ee188ebc54 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -74,25 +74,34 @@ void kcsan_counter_dec(enum kcsan_counter_id id) */ static noinline void microbenchmark(unsigned long iters) { + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + const bool was_enabled = READ_ONCE(kcsan_enabled); cycles_t cycles; + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + /* + * Disable to benchmark fast-path for all accesses, and (expected + * negligible) call into slow-path, but never set up watchpoints. + */ + WRITE_ONCE(kcsan_enabled, false); + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { - /* - * We can run this benchmark from multiple tasks; this address - * calculation increases likelyhood of some accesses - * overlapping. Make the access type an atomic read, to never - * set up watchpoints and test the fast-path only. - */ - unsigned long addr = - iters % (CONFIG_KCSAN_NUM_WATCHPOINTS * PAGE_SIZE); - __kcsan_check_access((void *)addr, sizeof(long), KCSAN_ACCESS_ATOMIC); + unsigned long addr = iters & ((PAGE_SIZE << 8) - 1); + int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC : + (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0); + __kcsan_check_access((void *)addr, sizeof(long), type); } cycles = get_cycles() - cycles; pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + WRITE_ONCE(kcsan_enabled, was_enabled); + /* restore context */ + current->kcsan_ctx = ctx_save; } /* -- cgit v1.2.3 From e7b34100500733f7e052ce3dee94e6338b86e6bc Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 5 Mar 2020 15:21:07 +0100 Subject: kcsan: Fix a typo in a comment s/slots slots/slots/ Signed-off-by: Qiujun Huang Reviewed-by: Nick Desaulniers [elver: commit message] Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index eb30ecdc8c00..ee8200835b60 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -45,7 +45,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { }; /* - * Helper macros to index into adjacent slots slots, starting from address slot + * Helper macros to index into adjacent slots, starting from address slot * itself, followed by the right and left slots. * * The purpose is 2-fold: -- cgit v1.2.3 From d8ef4b38cb69d907f9b0e889c44d05fc0f890977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Apr 2020 14:55:35 -0400 Subject: Revert "cgroup: Add memory barriers to plug cgroup_rstat_updated() race window" This reverts commit 9a9e97b2f1f2 ("cgroup: Add memory barriers to plug cgroup_rstat_updated() race window"). The commit was added in anticipation of memcg rstat conversion which needed synchronous accounting for the event counters (e.g. oom kill count). However, the conversion didn't get merged due to percpu memory overhead concern which couldn't be addressed at the time. Unfortunately, the patch's addition of smp_mb() to cgroup_rstat_updated() meant that every scheduling event now had to go through an additional full barrier and Mel Gorman noticed it as 1% regression in netperf UDP_STREAM test. There's no need to have this barrier in tree now and even if we need synchronous accounting in the future, the right thing to do is separating that out to a separate function so that hot paths which don't care about synchronous behavior don't have to pay the overhead of the full barrier. Let's revert. Signed-off-by: Tejun Heo Reported-by: Mel Gorman Link: http://lkml.kernel.org/r/20200409154413.GK3818@techsingularity.net Cc: v4.18+ --- kernel/cgroup/rstat.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 6f87352f8219..41ca996568df 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we - * see NULL updated_next or they see our updated stat. - */ - smp_mb(); - - /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. @@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - /* - * Paired with the one in cgroup_rstat_cpu_updated(). - * Either they see NULL updated_next or we see their - * updated stat. - */ - smp_mb(); - return pos; } -- cgit v1.2.3 From 135c0872d86948046d11d7083e36c930cc43ac93 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 18 Mar 2020 18:38:44 +0100 Subject: kcsan: Introduce report access_info and other_info Improve readability by introducing access_info and other_info structs, and in preparation of the following commit in this series replaces the single instance of other_info with an array of size 1. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 6 +-- kernel/kcsan/kcsan.h | 2 +- kernel/kcsan/report.c | 147 +++++++++++++++++++++++++------------------------- 3 files changed, 77 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index ee8200835b60..f1c38620e3cf 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -321,7 +321,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { - kcsan_report(ptr, size, type, true, raw_smp_processor_id(), + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT); } else { /* @@ -500,8 +500,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, raw_smp_processor_id(), - KCSAN_REPORT_RACE_SIGNAL); + kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ @@ -511,7 +510,6 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - raw_smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index e282f8b5749e..6630dfe32f31 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -135,7 +135,7 @@ enum kcsan_report_type { * Print a race report from thread that encountered the race. */ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, + enum kcsan_value_change value_change, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 18f9d3bc93a5..de234d1c1b3d 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -19,18 +19,23 @@ */ #define NUM_STACK_ENTRIES 64 +/* Common access info. */ +struct access_info { + const volatile void *ptr; + size_t size; + int access_type; + int task_pid; + int cpu_id; +}; + /* * Other thread info: communicated from other racing thread to thread that set * up the watchpoint, which then prints the complete report atomically. Only * need one struct, as all threads should to be serialized regardless to print * the reports, with reporting being in the slow-path. */ -static struct { - const volatile void *ptr; - size_t size; - int access_type; - int task_pid; - int cpu_id; +struct other_info { + struct access_info ai; unsigned long stack_entries[NUM_STACK_ENTRIES]; int num_stack_entries; @@ -52,7 +57,9 @@ static struct { * that populated @other_info until it has been consumed. */ struct task_struct *task; -} other_info; +}; + +static struct other_info other_infos[1]; /* * Information about reported races; used to rate limit reporting. @@ -238,7 +245,7 @@ static const char *get_thread_desc(int task_id) } /* Helper to skip KCSAN-related functions in stack-trace. */ -static int get_stack_skipnr(unsigned long stack_entries[], int num_entries) +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; int skip = 0; @@ -279,9 +286,10 @@ static void print_verbose_info(struct task_struct *task) /* * Returns true if a report was generated, false otherwise. */ -static bool print_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, - enum kcsan_report_type type) +static bool print_report(enum kcsan_value_change value_change, + enum kcsan_report_type type, + const struct access_info *ai, + const struct other_info *other_info) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); @@ -297,9 +305,9 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, return false; if (type == KCSAN_REPORT_RACE_SIGNAL) { - other_skipnr = get_stack_skipnr(other_info.stack_entries, - other_info.num_stack_entries); - other_frame = other_info.stack_entries[other_skipnr]; + other_skipnr = get_stack_skipnr(other_info->stack_entries, + other_info->num_stack_entries); + other_frame = other_info->stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ if (skip_report(value_change, other_frame)) @@ -321,13 +329,13 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, */ cmp = sym_strcmp((void *)other_frame, (void *)this_frame); pr_err("BUG: KCSAN: %s in %ps / %ps\n", - get_bug_type(access_type | other_info.access_type), + get_bug_type(ai->access_type | other_info->ai.access_type), (void *)(cmp < 0 ? other_frame : this_frame), (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(access_type), + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type), (void *)this_frame); break; @@ -341,30 +349,28 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, switch (type) { case KCSAN_REPORT_RACE_SIGNAL: pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(other_info.access_type), other_info.ptr, - other_info.size, get_thread_desc(other_info.task_pid), - other_info.cpu_id); + get_access_type(other_info->ai.access_type), other_info->ai.ptr, + other_info->ai.size, get_thread_desc(other_info->ai.task_pid), + other_info->ai.cpu_id); /* Print the other thread's stack trace. */ - stack_trace_print(other_info.stack_entries + other_skipnr, - other_info.num_stack_entries - other_skipnr, + stack_trace_print(other_info->stack_entries + other_skipnr, + other_info->num_stack_entries - other_skipnr, 0); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - print_verbose_info(other_info.task); + print_verbose_info(other_info->task); pr_err("\n"); pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(access_type), ptr, size, - get_thread_desc(in_task() ? task_pid_nr(current) : -1), - cpu_id); + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", - get_access_type(access_type), ptr, size, - get_thread_desc(in_task() ? task_pid_nr(current) : -1), - cpu_id); + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); break; default: @@ -386,22 +392,23 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, return true; } -static void release_report(unsigned long *flags, enum kcsan_report_type type) +static void release_report(unsigned long *flags, struct other_info *other_info) { - if (type == KCSAN_REPORT_RACE_SIGNAL) - other_info.ptr = NULL; /* mark for reuse */ + if (other_info) + other_info->ai.ptr = NULL; /* Mark for reuse. */ spin_unlock_irqrestore(&report_lock, *flags); } /* - * Sets @other_info.task and awaits consumption of @other_info. + * Sets @other_info->task and awaits consumption of @other_info. * * Precondition: report_lock is held. * Postcondition: report_lock is held. */ -static void -set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) +static void set_other_info_task_blocking(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) { /* * We may be instrumenting a code-path where current->state is already @@ -418,7 +425,7 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) */ int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt); - other_info.task = current; + other_info->task = current; do { if (is_running) { /* @@ -438,19 +445,19 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) spin_lock_irqsave(&report_lock, *flags); if (timeout-- < 0) { /* - * Abort. Reset other_info.task to NULL, since it + * Abort. Reset @other_info->task to NULL, since it * appears the other thread is still going to consume * it. It will result in no verbose info printed for * this task. */ - other_info.task = NULL; + other_info->task = NULL; break; } /* * If @ptr nor @current matches, then our information has been * consumed and we may continue. If not, retry. */ - } while (other_info.ptr == ptr && other_info.task == current); + } while (other_info->ai.ptr == ai->ptr && other_info->task == current); if (is_running) set_current_state(TASK_RUNNING); } @@ -460,9 +467,8 @@ set_other_info_task_blocking(unsigned long *flags, const volatile void *ptr) * acquires the matching other_info and returns true. If other_info is not * required for the report type, simply acquires report_lock and returns true. */ -static bool prepare_report(unsigned long *flags, const volatile void *ptr, - size_t size, int access_type, int cpu_id, - enum kcsan_report_type type) +static bool prepare_report(unsigned long *flags, enum kcsan_report_type type, + const struct access_info *ai, struct other_info *other_info) { if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && type != KCSAN_REPORT_RACE_SIGNAL) { @@ -476,18 +482,14 @@ retry: switch (type) { case KCSAN_REPORT_CONSUMED_WATCHPOINT: - if (other_info.ptr != NULL) + if (other_info->ai.ptr) break; /* still in use, retry */ - other_info.ptr = ptr; - other_info.size = size; - other_info.access_type = access_type; - other_info.task_pid = in_task() ? task_pid_nr(current) : -1; - other_info.cpu_id = cpu_id; - other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); + other_info->ai = *ai; + other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 1); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - set_other_info_task_blocking(flags, ptr); + set_other_info_task_blocking(flags, ai, other_info); spin_unlock_irqrestore(&report_lock, *flags); @@ -498,37 +500,31 @@ retry: return false; case KCSAN_REPORT_RACE_SIGNAL: - if (other_info.ptr == NULL) + if (!other_info->ai.ptr) break; /* no data available yet, retry */ /* * First check if this is the other_info we are expecting, i.e. * matches based on how watchpoint was encoded. */ - if (!matching_access((unsigned long)other_info.ptr & - WATCHPOINT_ADDR_MASK, - other_info.size, - (unsigned long)ptr & WATCHPOINT_ADDR_MASK, - size)) + if (!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, + (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)) break; /* mismatching watchpoint, retry */ - if (!matching_access((unsigned long)other_info.ptr, - other_info.size, (unsigned long)ptr, - size)) { + if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, + (unsigned long)ai->ptr, ai->size)) { /* * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc( - KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); /* discard this other_info */ - release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + release_report(flags, other_info); return false; } - access_type |= other_info.access_type; - if ((access_type & KCSAN_ACCESS_WRITE) == 0) { + if (!((ai->access_type | other_info->ai.access_type) & KCSAN_ACCESS_WRITE)) { /* * While the address matches, this is not the other_info * from the thread that consumed our watchpoint, since @@ -561,15 +557,11 @@ retry: * data, and at this point the likelihood that we * re-report the same race again is high. */ - release_report(flags, KCSAN_REPORT_RACE_SIGNAL); + release_report(flags, other_info); return false; } - /* - * Matching & usable access in other_info: keep other_info_lock - * locked, as this thread consumes it to print the full report; - * unlocked in release_report. - */ + /* Matching access in other_info. */ return true; default: @@ -582,10 +574,19 @@ retry: } void kcsan_report(const volatile void *ptr, size_t size, int access_type, - enum kcsan_value_change value_change, int cpu_id, + enum kcsan_value_change value_change, enum kcsan_report_type type) { unsigned long flags = 0; + const struct access_info ai = { + .ptr = ptr, + .size = size, + .access_type = access_type, + .task_pid = in_task() ? task_pid_nr(current) : -1, + .cpu_id = raw_smp_processor_id() + }; + struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN + ? NULL : &other_infos[0]; /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if @@ -596,19 +597,19 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, lockdep_off(); kcsan_disable_current(); - if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { + if (prepare_report(&flags, type, &ai, other_info)) { /* * Never report if value_change is FALSE, only if we it is * either TRUE or MAYBE. In case of MAYBE, further filtering may * be done once we know the full stack trace in print_report(). */ bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && - print_report(ptr, size, access_type, value_change, cpu_id, type); + print_report(value_change, type, &ai, other_info); if (reported && panic_on_warn) panic("panic_on_warn set ...\n"); - release_report(&flags, type); + release_report(&flags, other_info); } kcsan_enable_current(); -- cgit v1.2.3 From 6119418f94ca5314392d258d27eb0cb58bb1774e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 18 Mar 2020 18:38:45 +0100 Subject: kcsan: Avoid blocking producers in prepare_report() To avoid deadlock in case watchers can be interrupted, we need to ensure that producers of the struct other_info can never be blocked by an unrelated consumer. (Likely to occur with KCSAN_INTERRUPT_WATCHER.) There are several cases that can lead to this scenario, for example: 1. A watchpoint A was set up by task T1, but interrupted by interrupt I1. Some other thread (task or interrupt) finds watchpoint A consumes it, and sets other_info. Then I1 also finds some unrelated watchpoint B, consumes it, but is blocked because other_info is in use. T1 cannot consume other_info because I1 never returns -> deadlock. 2. A watchpoint A was set up by task T1, but interrupted by interrupt I1, which also sets up a watchpoint B. Some other thread finds watchpoint A, and consumes it and sets up other_info with its information. Similarly some other thread finds watchpoint B and consumes it, but is then blocked because other_info is in use. When I1 continues it sees its watchpoint was consumed, and that it must wait for other_info, which currently contains information to be consumed by T1. However, T1 cannot unblock other_info because I1 never returns -> deadlock. To avoid this, we need to ensure that producers of struct other_info always have a usable other_info entry. This is obviously not the case with only a single instance of struct other_info, as concurrent producers must wait for the entry to be released by some consumer (which may be locked up as illustrated above). While it would be nice if producers could simply call kmalloc() and append their instance of struct other_info to a list, we are very limited in this code path: since KCSAN can instrument the allocators themselves, calling kmalloc() could lead to deadlock or corrupted allocator state. Since producers of the struct other_info will always succeed at try_consume_watchpoint(), preceding the call into kcsan_report(), we know that the particular watchpoint slot cannot simply be reused or consumed by another potential other_info producer. If we move removal of a watchpoint after reporting (by the consumer of struct other_info), we can see a consumed watchpoint as a held lock on elements of other_info, if we create a one-to-one mapping of a watchpoint to an other_info element. Therefore, the simplest solution is to create an array of struct other_info that is as large as the watchpoints array in core.c, and pass the watchpoint index to kcsan_report() for producers and consumers, and change watchpoints to be removed after reporting is done. With a default config on a 64-bit system, the array other_infos consumes ~37KiB. For most systems today this is not a problem. On smaller memory constrained systems, the config value CONFIG_KCSAN_NUM_WATCHPOINTS can be reduced appropriately. Overall, this change is a simplification of the prepare_report() code, and makes some of the checks (such as checking if at least one access is a write) redundant. Tested: $ tools/testing/selftests/rcutorture/bin/kvm.sh \ --cpus 12 --duration 10 --kconfig "CONFIG_DEBUG_INFO=y \ CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n \ CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n \ CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y \ CONFIG_KCSAN_INTERRUPT_WATCHER=y CONFIG_PROVE_LOCKING=y" \ --configs TREE03 => No longer hangs and runs to completion as expected. Reported-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 31 +++++--- kernel/kcsan/kcsan.h | 3 +- kernel/kcsan/report.c | 212 ++++++++++++++++++++++++-------------------------- 3 files changed, 124 insertions(+), 122 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index f1c38620e3cf..4d8ea0fca5f1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -69,7 +69,6 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * slot=9: [10, 11, 9] * slot=63: [64, 65, 63] */ -#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) #define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) /* @@ -171,12 +170,16 @@ try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); } -/* - * Return true if watchpoint was not touched, false if consumed. - */ -static inline bool remove_watchpoint(atomic_long_t *watchpoint) +/* Return true if watchpoint was not touched, false if already consumed. */ +static inline bool consume_watchpoint(atomic_long_t *watchpoint) { - return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; + return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT; +} + +/* Remove the watchpoint -- its slot may be reused after. */ +static inline void remove_watchpoint(atomic_long_t *watchpoint) +{ + atomic_long_set(watchpoint, INVALID_WATCHPOINT); } static __always_inline struct kcsan_ctx *get_ctx(void) @@ -322,7 +325,8 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (consumed) { kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, - KCSAN_REPORT_CONSUMED_WATCHPOINT); + KCSAN_REPORT_CONSUMED_WATCHPOINT, + watchpoint - watchpoints); } else { /* * The other thread may not print any diagnostics, as it has @@ -470,7 +474,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) value_change = KCSAN_VALUE_CHANGE_TRUE; /* Check if this access raced with another. */ - if (!remove_watchpoint(watchpoint)) { + if (!consume_watchpoint(watchpoint)) { /* * Depending on the access type, map a value_change of MAYBE to * TRUE (always report) or FALSE (never report). @@ -500,7 +504,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); - kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL); + kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, + watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ @@ -510,9 +515,15 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, - KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, + watchpoint - watchpoints); } + /* + * Remove watchpoint; must be after reporting, since the slot may be + * reused after this point. + */ + remove_watchpoint(watchpoint); kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 6630dfe32f31..763d6d08d94b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -12,6 +12,7 @@ /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) extern unsigned int kcsan_udelay_task; extern unsigned int kcsan_udelay_interrupt; @@ -136,6 +137,6 @@ enum kcsan_report_type { */ extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, enum kcsan_value_change value_change, - enum kcsan_report_type type); + enum kcsan_report_type type, int watchpoint_idx); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index de234d1c1b3d..ae0a383238ea 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -30,9 +30,7 @@ struct access_info { /* * Other thread info: communicated from other racing thread to thread that set - * up the watchpoint, which then prints the complete report atomically. Only - * need one struct, as all threads should to be serialized regardless to print - * the reports, with reporting being in the slow-path. + * up the watchpoint, which then prints the complete report atomically. */ struct other_info { struct access_info ai; @@ -59,7 +57,11 @@ struct other_info { struct task_struct *task; }; -static struct other_info other_infos[1]; +/* + * To never block any producers of struct other_info, we need as many elements + * as we have watchpoints (upper bound on concurrent races to report). + */ +static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; /* * Information about reported races; used to rate limit reporting. @@ -96,10 +98,11 @@ struct report_time { static struct report_time report_times[REPORT_TIMES_SIZE]; /* - * This spinlock protects reporting and other_info, since other_info is usually - * required when reporting. + * Spinlock serializing report generation, and access to @other_infos. Although + * it could make sense to have a finer-grained locking story for @other_infos, + * report generation needs to be serialized either way, so not much is gained. */ -static DEFINE_SPINLOCK(report_lock); +static DEFINE_RAW_SPINLOCK(report_lock); /* * Checks if the race identified by thread frames frame1 and frame2 has @@ -395,9 +398,13 @@ static bool print_report(enum kcsan_value_change value_change, static void release_report(unsigned long *flags, struct other_info *other_info) { if (other_info) - other_info->ai.ptr = NULL; /* Mark for reuse. */ + /* + * Use size to denote valid/invalid, since KCSAN entirely + * ignores 0-sized accesses. + */ + other_info->ai.size = 0; - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); } /* @@ -435,14 +442,14 @@ static void set_other_info_task_blocking(unsigned long *flags, */ set_current_state(TASK_UNINTERRUPTIBLE); } - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); /* * We cannot call schedule() since we also cannot reliably * determine if sleeping here is permitted -- see in_atomic(). */ udelay(1); - spin_lock_irqsave(&report_lock, *flags); + raw_spin_lock_irqsave(&report_lock, *flags); if (timeout-- < 0) { /* * Abort. Reset @other_info->task to NULL, since it @@ -454,128 +461,107 @@ static void set_other_info_task_blocking(unsigned long *flags, break; } /* - * If @ptr nor @current matches, then our information has been - * consumed and we may continue. If not, retry. + * If invalid, or @ptr nor @current matches, then @other_info + * has been consumed and we may continue. If not, retry. */ - } while (other_info->ai.ptr == ai->ptr && other_info->task == current); + } while (other_info->ai.size && other_info->ai.ptr == ai->ptr && + other_info->task == current); if (is_running) set_current_state(TASK_RUNNING); } -/* - * Depending on the report type either sets other_info and returns false, or - * acquires the matching other_info and returns true. If other_info is not - * required for the report type, simply acquires report_lock and returns true. - */ -static bool prepare_report(unsigned long *flags, enum kcsan_report_type type, - const struct access_info *ai, struct other_info *other_info) +/* Populate @other_info; requires that the provided @other_info not in use. */ +static void prepare_report_producer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) { - if (type != KCSAN_REPORT_CONSUMED_WATCHPOINT && - type != KCSAN_REPORT_RACE_SIGNAL) { - /* other_info not required; just acquire report_lock */ - spin_lock_irqsave(&report_lock, *flags); - return true; - } + raw_spin_lock_irqsave(&report_lock, *flags); -retry: - spin_lock_irqsave(&report_lock, *flags); + /* + * The same @other_infos entry cannot be used concurrently, because + * there is a one-to-one mapping to watchpoint slots (@watchpoints in + * core.c), and a watchpoint is only released for reuse after reporting + * is done by the consumer of @other_info. Therefore, it is impossible + * for another concurrent prepare_report_producer() to set the same + * @other_info, and are guaranteed exclusivity for the @other_infos + * entry pointed to by @other_info. + * + * To check this property holds, size should never be non-zero here, + * because every consumer of struct other_info resets size to 0 in + * release_report(). + */ + WARN_ON(other_info->ai.size); - switch (type) { - case KCSAN_REPORT_CONSUMED_WATCHPOINT: - if (other_info->ai.ptr) - break; /* still in use, retry */ + other_info->ai = *ai; + other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2); - other_info->ai = *ai; - other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 1); + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + set_other_info_task_blocking(flags, ai, other_info); - if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) - set_other_info_task_blocking(flags, ai, other_info); + raw_spin_unlock_irqrestore(&report_lock, *flags); +} - spin_unlock_irqrestore(&report_lock, *flags); +/* Awaits producer to fill @other_info and then returns. */ +static bool prepare_report_consumer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) +{ - /* - * The other thread will print the summary; other_info may now - * be consumed. - */ - return false; + raw_spin_lock_irqsave(&report_lock, *flags); + while (!other_info->ai.size) { /* Await valid @other_info. */ + raw_spin_unlock_irqrestore(&report_lock, *flags); + cpu_relax(); + raw_spin_lock_irqsave(&report_lock, *flags); + } - case KCSAN_REPORT_RACE_SIGNAL: - if (!other_info->ai.ptr) - break; /* no data available yet, retry */ + /* Should always have a matching access based on watchpoint encoding. */ + if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, + (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size))) + goto discard; + if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, + (unsigned long)ai->ptr, ai->size)) { /* - * First check if this is the other_info we are expecting, i.e. - * matches based on how watchpoint was encoded. + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. */ - if (!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, - (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)) - break; /* mismatching watchpoint, retry */ - - if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, - (unsigned long)ai->ptr, ai->size)) { - /* - * If the actual accesses to not match, this was a false - * positive due to watchpoint encoding. - */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); - - /* discard this other_info */ - release_report(flags, other_info); - return false; - } + kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + goto discard; + } - if (!((ai->access_type | other_info->ai.access_type) & KCSAN_ACCESS_WRITE)) { - /* - * While the address matches, this is not the other_info - * from the thread that consumed our watchpoint, since - * neither this nor the access in other_info is a write. - * It is invalid to continue with the report, since we - * only have information about reads. - * - * This can happen due to concurrent races on the same - * address, with at least 4 threads. To avoid locking up - * other_info and all other threads, we have to consume - * it regardless. - * - * A concrete case to illustrate why we might lock up if - * we do not consume other_info: - * - * We have 4 threads, all accessing the same address - * (or matching address ranges). Assume the following - * watcher and watchpoint consumer pairs: - * write1-read1, read2-write2. The first to populate - * other_info is write2, however, write1 consumes it, - * resulting in a report of write1-write2. This report - * is valid, however, now read1 populates other_info; - * read2-read1 is an invalid conflict, yet, no other - * conflicting access is left. Therefore, we must - * consume read1's other_info. - * - * Since this case is assumed to be rare, it is - * reasonable to omit this report: one of the other - * reports includes information about the same shared - * data, and at this point the likelihood that we - * re-report the same race again is high. - */ - release_report(flags, other_info); - return false; - } + return true; - /* Matching access in other_info. */ - return true; +discard: + release_report(flags, other_info); + return false; +} +/* + * Depending on the report type either sets @other_info and returns false, or + * awaits @other_info and returns true. If @other_info is not required for the + * report type, simply acquires @report_lock and returns true. + */ +static noinline bool prepare_report(unsigned long *flags, + enum kcsan_report_type type, + const struct access_info *ai, + struct other_info *other_info) +{ + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + prepare_report_producer(flags, ai, other_info); + return false; + case KCSAN_REPORT_RACE_SIGNAL: + return prepare_report_consumer(flags, ai, other_info); default: - BUG(); + /* @other_info not required; just acquire @report_lock. */ + raw_spin_lock_irqsave(&report_lock, *flags); + return true; } - - spin_unlock_irqrestore(&report_lock, *flags); - - goto retry; } void kcsan_report(const volatile void *ptr, size_t size, int access_type, enum kcsan_value_change value_change, - enum kcsan_report_type type) + enum kcsan_report_type type, int watchpoint_idx) { unsigned long flags = 0; const struct access_info ai = { @@ -586,7 +572,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, .cpu_id = raw_smp_processor_id() }; struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN - ? NULL : &other_infos[0]; + ? NULL : &other_infos[watchpoint_idx]; + + kcsan_disable_current(); + if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos))) + goto out; /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if @@ -596,7 +586,6 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, */ lockdep_off(); - kcsan_disable_current(); if (prepare_report(&flags, type, &ai, other_info)) { /* * Never report if value_change is FALSE, only if we it is @@ -611,7 +600,8 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, release_report(&flags, other_info); } - kcsan_enable_current(); lockdep_on(); +out: + kcsan_enable_current(); } -- cgit v1.2.3 From 757a4cefde76697af2b2c284c8a320912b77e7e6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:56 +0100 Subject: kcsan: Add support for scoped accesses This adds support for scoped accesses, where the memory range is checked for the duration of the scope. The feature is implemented by inserting the relevant access information into a list of scoped accesses for the current execution context, which are then checked (until removed) on every call (through instrumentation) into the KCSAN runtime. An alternative, more complex, implementation could set up a watchpoint for the scoped access, and keep the watchpoint set up. This, however, would require first exposing a handle to the watchpoint, as well as dealing with cases such as accesses by the same thread while the watchpoint is still set up (and several more cases). It is also doubtful if this would provide any benefit, since the majority of delay where the watchpoint is set up is likely due to the injected delays by KCSAN. Therefore, the implementation in this patch is simpler and avoids hurting KCSAN's main use-case (normal data race detection); it also implicitly increases scoped-access race-detection-ability due to increased probability of setting up watchpoints by repeatedly calling __kcsan_check_access() throughout the scope of the access. The implementation required adding an additional conditional branch to the fast-path. However, the microbenchmark showed a *speedup* of ~5% on the fast-path. This appears to be due to subtly improved codegen by GCC from moving get_ctx() and associated load of preempt_count earlier. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/kcsan/report.c | 33 +++++++++++++------- 2 files changed, 97 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4d8ea0fca5f1..a572aae61b98 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_nest_count = 0, .in_flat_atomic = false, .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, }; /* @@ -191,12 +193,23 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } +/* Check scoped accesses; never inline because this is a slow-path! */ +static noinline void kcsan_check_scoped_accesses(void) +{ + struct kcsan_ctx *ctx = get_ctx(); + struct list_head *prev_save = ctx->scoped_accesses.prev; + struct kcsan_scoped_access *scoped_access; + + ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) + __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type); + ctx->scoped_accesses.prev = prev_save; +} + /* Rules for generic atomic accesses. Called from fast-path. */ static __always_inline bool -is_atomic(const volatile void *ptr, size_t size, int type) +is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { - struct kcsan_ctx *ctx; - if (type & KCSAN_ACCESS_ATOMIC) return true; @@ -213,7 +226,6 @@ is_atomic(const volatile void *ptr, size_t size, int type) IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ - ctx = get_ctx(); if (ctx->atomic_next > 0) { /* * Because we do not have separate contexts for nested @@ -233,7 +245,7 @@ is_atomic(const volatile void *ptr, size_t size, int type) } static __always_inline bool -should_watch(const volatile void *ptr, size_t size, int type) +should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { /* * Never set up watchpoints when memory operations are atomic. @@ -242,7 +254,7 @@ should_watch(const volatile void *ptr, size_t size, int type) * should not count towards skipped instructions, and (2) to actually * decrement kcsan_atomic_next for consecutive instruction stream. */ - if (is_atomic(ptr, size, type)) + if (is_atomic(ptr, size, type, ctx)) return false; if (this_cpu_dec_return(kcsan_skip) >= 0) @@ -563,8 +575,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, if (unlikely(watchpoint != NULL)) kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); - else if (unlikely(should_watch(ptr, size, type))) - kcsan_setup_watchpoint(ptr, size, type); + else { + struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ + + if (unlikely(should_watch(ptr, size, type, ctx))) + kcsan_setup_watchpoint(ptr, size, type); + else if (unlikely(ctx->scoped_accesses.prev)) + kcsan_check_scoped_accesses(); + } } /* === Public interface ===================================================== */ @@ -660,6 +678,55 @@ void kcsan_set_access_mask(unsigned long mask) } EXPORT_SYMBOL(kcsan_set_access_mask); +struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + __kcsan_check_access(ptr, size, type); + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + INIT_LIST_HEAD(&sa->list); + sa->ptr = ptr; + sa->size = size; + sa->type = type; + + if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */ + INIT_LIST_HEAD(&ctx->scoped_accesses); + list_add(&sa->list, &ctx->scoped_accesses); + + ctx->disable_count--; + return sa; +} +EXPORT_SYMBOL(kcsan_begin_scoped_access); + +void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__)) + return; + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + list_del(&sa->list); + if (list_empty(&ctx->scoped_accesses)) + /* + * Ensure we do not enter kcsan_check_scoped_accesses() + * slow-path if unnecessary, and avoids requiring list_empty() + * in the fast-path (to avoid a READ_ONCE() and potential + * uaccess warning). + */ + ctx->scoped_accesses.prev = NULL; + + ctx->disable_count--; + + __kcsan_check_access(sa->ptr, sa->size, sa->type); +} +EXPORT_SYMBOL(kcsan_end_scoped_access); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ae0a383238ea..ddc18f1224a4 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -205,6 +205,20 @@ skip_report(enum kcsan_value_change value_change, unsigned long top_frame) static const char *get_access_type(int type) { + if (type & KCSAN_ACCESS_ASSERT) { + if (type & KCSAN_ACCESS_SCOPED) { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses (scoped)"; + else + return "assert no writes (scoped)"; + } else { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses"; + else + return "assert no writes"; + } + } + switch (type) { case 0: return "read"; @@ -214,17 +228,14 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; - - /* - * ASSERT variants: - */ - case KCSAN_ACCESS_ASSERT: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: - return "assert no writes"; - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "assert no accesses"; - + case KCSAN_ACCESS_SCOPED: + return "read (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: + return "read (marked, scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: + return "write (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked, scoped)"; default: BUG(); } -- cgit v1.2.3 From d8949ef1d9f1062848cd068cf369a57ce33dae6f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:58 +0100 Subject: kcsan: Introduce scoped ASSERT_EXCLUSIVE macros Introduce ASSERT_EXCLUSIVE_*_SCOPED(), which provide an intuitive interface to use the scoped-access feature, without having to explicitly mark the start and end of the desired scope. Basing duration of the checks on scope avoids accidental misuse and resulting false positives, which may be hard to debug. See added comments for usage. The macros are implemented using __attribute__((__cleanup__(func))), which is supported by all compilers that currently support KCSAN. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 72ee188ebc54..1a08664a7fab 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -110,6 +110,7 @@ static noinline void microbenchmark(unsigned long iters) */ static long test_dummy; static long test_flags; +static long test_scoped; static noinline void test_thread(unsigned long iters) { const long CHANGE_BITS = 0xff00ff00ff00ff00L; @@ -120,7 +121,8 @@ static noinline void test_thread(unsigned long iters) memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); + pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", + &test_dummy, &test_flags, &test_scoped); cycles = get_cycles(); while (iters--) { @@ -141,6 +143,18 @@ static noinline void test_thread(unsigned long iters) test_flags ^= CHANGE_BITS; /* generate value-change */ __kcsan_check_write(&test_flags, sizeof(test_flags)); + + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); + { + /* Should generate reports anywhere in this block. */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); + BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); + /* Unrelated accesses. */ + __kcsan_check_access(&cycles, sizeof(cycles), 0); + __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); + } + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); } cycles = get_cycles() - cycles; -- cgit v1.2.3 From f770ed10a9ee65529f3ec8d90eb374bbd8b7c238 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Apr 2020 18:44:17 +0200 Subject: kcsan: Fix function matching in report Pass string length as returned by scnprintf() to strnstr(), since strnstr() searches exactly len bytes in haystack, even if it contains a NUL-terminator before haystack+len. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/report.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ddc18f1224a4..cf41d63dd0cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -192,11 +192,11 @@ skip_report(enum kcsan_value_change value_change, unsigned long top_frame) * maintainers. */ char buf[64]; + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame); - snprintf(buf, sizeof(buf), "%ps", (void *)top_frame); - if (!strnstr(buf, "rcu_", sizeof(buf)) && - !strnstr(buf, "_rcu", sizeof(buf)) && - !strnstr(buf, "_srcu", sizeof(buf))) + if (!strnstr(buf, "rcu_", len) && + !strnstr(buf, "_rcu", len) && + !strnstr(buf, "_srcu", len)) return true; } @@ -262,15 +262,15 @@ static const char *get_thread_desc(int task_id) static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; + int len; int skip = 0; for (; skip < num_entries; ++skip) { - snprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); - if (!strnstr(buf, "csan_", sizeof(buf)) && - !strnstr(buf, "tsan_", sizeof(buf)) && - !strnstr(buf, "_once_size", sizeof(buf))) { + len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + if (!strnstr(buf, "csan_", len) && + !strnstr(buf, "tsan_", len) && + !strnstr(buf, "_once_size", len)) break; - } } return skip; } -- cgit v1.2.3 From cdb9b07d8c78be63d72aba9a2686ff161ddd2099 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 10 Apr 2020 18:44:18 +0200 Subject: kcsan: Make reporting aware of KCSAN tests Reporting hides KCSAN runtime functions in the stack trace, with filtering done based on function names. Currently this included all functions (or modules) that would match "kcsan_". Make the filter aware of KCSAN tests, which contain "kcsan_test", and are no longer skipped in the report. This is in preparation for adding a KCSAN test module. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/report.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index cf41d63dd0cd..ac5f8345bae9 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -262,16 +262,32 @@ static const char *get_thread_desc(int task_id) static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) { char buf[64]; - int len; - int skip = 0; + char *cur; + int len, skip; - for (; skip < num_entries; ++skip) { + for (skip = 0; skip < num_entries; ++skip) { len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); - if (!strnstr(buf, "csan_", len) && - !strnstr(buf, "tsan_", len) && - !strnstr(buf, "_once_size", len)) - break; + + /* Never show tsan_* or {read,write}_once_size. */ + if (strnstr(buf, "tsan_", len) || + strnstr(buf, "_once_size", len)) + continue; + + cur = strnstr(buf, "kcsan_", len); + if (cur) { + cur += sizeof("kcsan_") - 1; + if (strncmp(cur, "test", sizeof("test") - 1)) + continue; /* KCSAN runtime function. */ + /* KCSAN related test. */ + } + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; } + return skip; } -- cgit v1.2.3 From 4c5b566c2193e2af82c891daa5303c8899e61044 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 31 Mar 2020 02:15:44 +0800 Subject: crash_dump: Remove no longer used saved_max_pfn saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH] kdump: Retrieve saved max pfn") It used to make sure that the user does not try to read the physical memory beyond saved_max_pfn. But since commit 921d58c0e699 ("vmcore: remove saved_max_pfn check") it's no longer used for the check. This variable doesn't have any users anymore so just remove it. [ bp: Drop the Calgary IOMMU reference from the commit message. ] Signed-off-by: Kairui Song Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20200330181544.1595733-1-kasong@redhat.com --- kernel/crash_dump.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -5,12 +5,6 @@ #include #include -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - /* * stores the physical address of elf header of crash image * -- cgit v1.2.3 From 10415533a9062c9e5e27c421d9c232f0cae108fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 22 Jan 2020 19:36:29 +0000 Subject: gcov: Remove old GCC 3.4 support The kernel requires at least GCC 4.8 in order to build, and so there is no need to cater for the pre-4.7 gcov format. Remove the obsolete code. Acked-by: Peter Oberparleiter Reviewed-by: Nick Desaulniers Signed-off-by: Will Deacon --- kernel/gcov/Kconfig | 24 --- kernel/gcov/Makefile | 3 +- kernel/gcov/gcc_3_4.c | 573 -------------------------------------------------- 3 files changed, 1 insertion(+), 599 deletions(-) delete mode 100644 kernel/gcov/gcc_3_4.c (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..feaad597b3f4 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -51,28 +51,4 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. -choice - prompt "Specify GCOV format" - depends on GCOV_KERNEL - depends on CC_IS_GCC - ---help--- - The gcov format is usually determined by the GCC version, and the - default is chosen according to your GCC version. However, there are - exceptions where format changes are integrated in lower-version GCCs. - In such a case, change this option to adjust the format used in the - kernel accordingly. - -config GCOV_FORMAT_3_4 - bool "GCC 3.4 format" - depends on GCC_VERSION < 40700 - ---help--- - Select this option to use the format defined by GCC 3.4. - -config GCOV_FORMAT_4_7 - bool "GCC 4.7 format" - ---help--- - Select this option to use the format defined by GCC 4.7. - -endchoice - endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index d66a74b0f100..16f8ecc7d882 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,6 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c deleted file mode 100644 index acb83558e5df..000000000000 --- a/kernel/gcov/gcc_3_4.c +++ /dev/null @@ -1,573 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code provides functions to handle gcc's profiling data format - * introduced with gcc 3.4. Future versions of gcc may change the gcov - * format (as happened before), so all format-specific information needs - * to be kept modular and easily exchangeable. - * - * This file is based on gcc-internal definitions. Functions and data - * structures are defined to be compatible with gcc counterparts. - * For a better understanding, refer to gcc source: gcc/gcov-io.h. - * - * Copyright IBM Corp. 2009 - * Author(s): Peter Oberparleiter - * - * Uses gcc-internal data definitions. - */ - -#include -#include -#include -#include -#include -#include "gcov.h" - -#define GCOV_COUNTERS 5 - -static struct gcov_info *gcov_info_head; - -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; - -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - unsigned int n_functions; - const struct gcov_fn_info *functions; - unsigned int ctr_mask; - struct gcov_ctr_info counts[]; -}; - -/** - * gcov_info_filename - return info filename - * @info: profiling data set - */ -const char *gcov_info_filename(struct gcov_info *info) -{ - return info->filename; -} - -/** - * gcov_info_version - return info version - * @info: profiling data set - */ -unsigned int gcov_info_version(struct gcov_info *info) -{ - return info->version; -} - -/** - * gcov_info_next - return next profiling data set - * @info: profiling data set - * - * Returns next gcov_info following @info or first gcov_info in the chain if - * @info is %NULL. - */ -struct gcov_info *gcov_info_next(struct gcov_info *info) -{ - if (!info) - return gcov_info_head; - - return info->next; -} - -/** - * gcov_info_link - link/add profiling data set to the list - * @info: profiling data set - */ -void gcov_info_link(struct gcov_info *info) -{ - info->next = gcov_info_head; - gcov_info_head = info; -} - -/** - * gcov_info_unlink - unlink/remove profiling data set from the list - * @prev: previous profiling data set - * @info: profiling data set - */ -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) -{ - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; -} - -/** - * gcov_info_within_module - check if a profiling data set belongs to a module - * @info: profiling data set - * @mod: module - * - * Returns true if profiling data belongs module, false otherwise. - */ -bool gcov_info_within_module(struct gcov_info *info, struct module *mod) -{ - return within_module((unsigned long)info, mod); -} - -/* Symbolic links to be created for each profiling data file. */ -const struct gcov_link gcov_link[] = { - { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ - { 0, NULL}, -}; - -/* - * Determine whether a counter is active. Based on gcc magic. Doesn't change - * at run-time. - */ -static int counter_active(struct gcov_info *info, unsigned int type) -{ - return (1 << type) & info->ctr_mask; -} - -/* Determine number of active counters. Based on gcc magic. */ -static unsigned int num_counter_active(struct gcov_info *info) -{ - unsigned int i; - unsigned int result = 0; - - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(info, i)) - result++; - } - return result; -} - -/** - * gcov_info_reset - reset profiling data to zero - * @info: profiling data set - */ -void gcov_info_reset(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active; i++) { - memset(info->counts[i].values, 0, - info->counts[i].num * sizeof(gcov_type)); - } -} - -/** - * gcov_info_is_compatible - check if profiling data can be added - * @info1: first profiling data set - * @info2: second profiling data set - * - * Returns non-zero if profiling data can be added, zero otherwise. - */ -int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) -{ - return (info1->stamp == info2->stamp); -} - -/** - * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added - * - * Adds profiling counts of @source to @dest. - */ -void gcov_info_add(struct gcov_info *dest, struct gcov_info *source) -{ - unsigned int i; - unsigned int j; - - for (i = 0; i < num_counter_active(dest); i++) { - for (j = 0; j < dest->counts[i].num; j++) { - dest->counts[i].values[j] += - source->counts[i].values[j]; - } - } -} - -/* Get size of function info entry. Based on gcc magic. */ -static size_t get_fn_size(struct gcov_info *info) -{ - size_t size; - - size = sizeof(struct gcov_fn_info) + num_counter_active(info) * - sizeof(unsigned int); - if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int)) - size = ALIGN(size, __alignof__(struct gcov_fn_info)); - return size; -} - -/* Get address of function info entry. Based on gcc magic. */ -static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn) -{ - return (struct gcov_fn_info *) - ((char *) info->functions + fn * get_fn_size(info)); -} - -/** - * gcov_info_dup - duplicate profiling data set - * @info: profiling data set to duplicate - * - * Return newly allocated duplicate on success, %NULL on error. - */ -struct gcov_info *gcov_info_dup(struct gcov_info *info) -{ - struct gcov_info *dup; - unsigned int i; - unsigned int active; - - /* Duplicate gcov_info. */ - active = num_counter_active(info); - dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); - if (!dup) - return NULL; - dup->version = info->version; - dup->stamp = info->stamp; - dup->n_functions = info->n_functions; - dup->ctr_mask = info->ctr_mask; - /* Duplicate filename. */ - dup->filename = kstrdup(info->filename, GFP_KERNEL); - if (!dup->filename) - goto err_free; - /* Duplicate table of functions. */ - dup->functions = kmemdup(info->functions, info->n_functions * - get_fn_size(info), GFP_KERNEL); - if (!dup->functions) - goto err_free; - /* Duplicate counter arrays. */ - for (i = 0; i < active ; i++) { - struct gcov_ctr_info *ctr = &info->counts[i]; - size_t size = ctr->num * sizeof(gcov_type); - - dup->counts[i].num = ctr->num; - dup->counts[i].merge = ctr->merge; - dup->counts[i].values = vmalloc(size); - if (!dup->counts[i].values) - goto err_free; - memcpy(dup->counts[i].values, ctr->values, size); - } - return dup; - -err_free: - gcov_info_free(dup); - return NULL; -} - -/** - * gcov_info_free - release memory for profiling data set duplicate - * @info: profiling data set duplicate to free - */ -void gcov_info_free(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active ; i++) - vfree(info->counts[i].values); - kfree(info->functions); - kfree(info->filename); - kfree(info); -} - -/** - * struct type_info - iterator helper array - * @ctr_type: counter type - * @offset: index of the first value of the current function for this type - * - * This array is needed to convert the in-memory data format into the in-file - * data format: - * - * In-memory: - * for each counter type - * for each function - * values - * - * In-file: - * for each function - * for each counter type - * values - * - * See gcc source gcc/gcov-io.h for more information on data organization. - */ -struct type_info { - int ctr_type; - unsigned int offset; -}; - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @record: record type - * @function: function number - * @type: counter type - * @count: index into values array - * @num_types: number of counter types - * @type_info: helper array to get values-array offset for current function - */ -struct gcov_iterator { - struct gcov_info *info; - - int record; - unsigned int function; - unsigned int type; - unsigned int count; - - int num_types; - struct type_info type_info[]; -}; - -static struct gcov_fn_info *get_func(struct gcov_iterator *iter) -{ - return get_fn_info(iter->info, iter->function); -} - -static struct type_info *get_type(struct gcov_iterator *iter) -{ - return &iter->type_info[iter->type]; -} - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), - GFP_KERNEL); - if (iter) - iter->info = info; - - return iter; -} - -/** - * gcov_iter_free - release memory for iterator - * @iter: file iterator to free - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - int i; - - iter->record = 0; - iter->function = 0; - iter->type = 0; - iter->count = 0; - iter->num_types = 0; - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(iter->info, i)) { - iter->type_info[iter->num_types].ctr_type = i; - iter->type_info[iter->num_types++].offset = 0; - } - } -} - -/* Mapping of logical record number to actual file content. */ -#define RECORD_FILE_MAGIC 0 -#define RECORD_GCOV_VERSION 1 -#define RECORD_TIME_STAMP 2 -#define RECORD_FUNCTION_TAG 3 -#define RECORD_FUNCTON_TAG_LEN 4 -#define RECORD_FUNCTION_IDENT 5 -#define RECORD_FUNCTION_CHECK 6 -#define RECORD_COUNT_TAG 7 -#define RECORD_COUNT_LEN 8 -#define RECORD_COUNT 9 - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - switch (iter->record) { - case RECORD_FILE_MAGIC: - case RECORD_GCOV_VERSION: - case RECORD_FUNCTION_TAG: - case RECORD_FUNCTON_TAG_LEN: - case RECORD_FUNCTION_IDENT: - case RECORD_COUNT_TAG: - /* Advance to next record */ - iter->record++; - break; - case RECORD_COUNT: - /* Advance to next count */ - iter->count++; - /* fall through */ - case RECORD_COUNT_LEN: - if (iter->count < get_func(iter)->n_ctrs[iter->type]) { - iter->record = 9; - break; - } - /* Advance to next counter type */ - get_type(iter)->offset += iter->count; - iter->count = 0; - iter->type++; - /* fall through */ - case RECORD_FUNCTION_CHECK: - if (iter->type < iter->num_types) { - iter->record = 7; - break; - } - /* Advance to next function */ - iter->type = 0; - iter->function++; - /* fall through */ - case RECORD_TIME_STAMP: - if (iter->function < iter->info->n_functions) - iter->record = 3; - else - iter->record = -1; - break; - } - /* Check for EOF. */ - if (iter->record == -1) - return -EINVAL; - else - return 0; -} - -/** - * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. - */ -static int seq_write_gcov_u32(struct seq_file *seq, u32 v) -{ - return seq_write(seq, &v, sizeof(v)); -} - -/** - * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. - */ -static int seq_write_gcov_u64(struct seq_file *seq, u64 v) -{ - u32 data[2]; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - return seq_write(seq, data, sizeof(data)); -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - int rc = -EINVAL; - - switch (iter->record) { - case RECORD_FILE_MAGIC: - rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); - break; - case RECORD_GCOV_VERSION: - rc = seq_write_gcov_u32(seq, iter->info->version); - break; - case RECORD_TIME_STAMP: - rc = seq_write_gcov_u32(seq, iter->info->stamp); - break; - case RECORD_FUNCTION_TAG: - rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); - break; - case RECORD_FUNCTON_TAG_LEN: - rc = seq_write_gcov_u32(seq, 2); - break; - case RECORD_FUNCTION_IDENT: - rc = seq_write_gcov_u32(seq, get_func(iter)->ident); - break; - case RECORD_FUNCTION_CHECK: - rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); - break; - case RECORD_COUNT_TAG: - rc = seq_write_gcov_u32(seq, - GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); - break; - case RECORD_COUNT_LEN: - rc = seq_write_gcov_u32(seq, - get_func(iter)->n_ctrs[iter->type] * 2); - break; - case RECORD_COUNT: - rc = seq_write_gcov_u64(seq, - iter->info->counts[iter->type]. - values[iter->count + get_type(iter)->offset]); - break; - } - return rc; -} -- cgit v1.2.3 From 18aa18566218d4a46d940049b835314d2b071cc2 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:46:24 +0300 Subject: perf/core: Open access to the core for CAP_PERFMON privileged process Open access to monitoring of kernel code, CPUs, tracepoints and namespaces data for a CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons the access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. Signed-off-by: Alexey Budankov Reviewed-by: James Morris Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: linux-man@vger.kernel.org Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/471acaef-bb8a-5ce2-923f-90606b78eef9@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..74025b7b83a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11504,7 +11504,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } -- cgit v1.2.3 From c9e0924e5c2b59365f9c0d43ff8722e79ecf4088 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:47:01 +0300 Subject: perf/core: open access to probes for CAP_PERFMON privileged process Open access to monitoring via kprobes and uprobes and eBPF tracing for CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. perf kprobes and uprobes are used by ftrace and eBPF. perf probe uses ftrace to define new kprobe events, and those events are treated as tracepoint events. eBPF defines new probes via perf_event_open interface and then the probes are used in eBPF tracing. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. Signed-off-by: Alexey Budankov Reviewed-by: James Morris Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Cc: linux-man@vger.kernel.org Link: http://lore.kernel.org/lkml/3c129d9a-ba8a-3483-ecc5-ad6c8e7c203f@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 74025b7b83a0..52951e9e8e1b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9397,7 +9397,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9457,7 +9457,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* -- cgit v1.2.3 From 031258da05956646c5606023ab0abe10a7e68ea1 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:48:54 +0300 Subject: trace/bpf_trace: Open access for CAP_PERFMON privileged process Open access to bpf_trace monitoring for CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons access to bpf_trace monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged with respect to CAP_PERFMON capability. Signed-off-by: Alexey Budankov Reviewed-by: James Morris Acked-by: Song Liu Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/c0a0ae47-8b6e-ff3e-416b-3cd1faaf71c0@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..d7d88007dc6d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1468,7 +1468,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; -- cgit v1.2.3 From db991af02f11053558431467102ee5832894d7a4 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 8 Apr 2020 16:31:06 +0200 Subject: module: break nested ARCH_HAS_STRICT_MODULE_RWX and STRICT_MODULE_RWX #ifdefs Various frob_* and module_{enable,disable}_* functions are defined in a CONFIG_ARCH_HAS_STRICT_MODULE_RWX ifdef block which also has a nested CONFIG_STRICT_MODULE_RWX ifdef block within it. This is unecessary and makes things hard to read. Not only that, this construction requires redundant empty stubs for module_enable_nx(). I suspect this was originally done for cosmetic reasons - to keep all the frob_* functions in the same place, and all the module_{enable,disable}_* functions right after, but as a result it made the code harder to read. Make this more readable by unnesting the ifdef blocks and getting rid of the redundant empty stubs. Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..01d01a489778 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1943,7 +1943,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -1957,6 +1956,14 @@ static void mod_sysfs_teardown(struct module *mod) * * These values are always page-aligned (as is base) */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the + * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of + * whether we are strict. + */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX static void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -1966,6 +1973,15 @@ static void frob_text(const struct module_layout *layout, layout->text_size >> PAGE_SHIFT); } +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + #ifdef CONFIG_STRICT_MODULE_RWX static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) @@ -2037,18 +2053,9 @@ static void module_enable_nx(const struct module *mod) } #else /* !CONFIG_STRICT_MODULE_RWX */ +/* module_{enable,disable}_ro() stubs are in module.h */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) -{ - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - #ifdef CONFIG_LIVEPATCH /* -- cgit v1.2.3 From 58eb7b77ad01f058e523554cb7bae7272a7d2579 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Sat, 18 Apr 2020 00:24:51 +0800 Subject: smp: Use smp_call_func_t in on_each_cpu() Use smp_call_func_t instead of the open coded function pointer argument. Signed-off-by: Kaitao Cheng Signed-off-by: Thomas Gleixner Acked-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20200417162451.91969-1-pilgrimtao@gmail.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..84303197caf9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -620,7 +620,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; -- cgit v1.2.3 From 05f099a7d0a73114c6eb3e6a359ea97563b47031 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 17 Apr 2020 15:36:05 +0800 Subject: dma-debug: make __dma_entry_alloc_check_leak() static Fix the following sparse warning: kernel/dma/debug.c:659:6: warning: symbol '__dma_entry_alloc_check_leak' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 9e1777c81f55..36c962a86bf2 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -656,7 +656,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -void __dma_entry_alloc_check_leak(void) +static void __dma_entry_alloc_check_leak(void) { u32 tmp = nr_total_entries % nr_prealloc_entries; -- cgit v1.2.3 From e860c299ac0d738b44ff91693f11e63080a29698 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:52 -0700 Subject: dma-remap: separate DMA atomic pools from direct remap code DMA atomic pools will be needed beyond only CONFIG_DMA_DIRECT_REMAP so separate them out into their own file. This also adds a new Kconfig option that can be subsequently used for options, such as CONFIG_AMD_MEM_ENCRYPT, that will utilize the coherent pools but do not have a dependency on direct remapping. For this patch alone, there is no functional change introduced. Reviewed-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: fixup copyrights and remove unused includes] Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 6 ++- kernel/dma/Makefile | 1 + kernel/dma/pool.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/remap.c | 121 +-------------------------------------------------- 4 files changed, 130 insertions(+), 121 deletions(-) create mode 100644 kernel/dma/pool.c (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..d006668c0027 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -79,10 +79,14 @@ config DMA_REMAP select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..370f63344e9c 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..3df5d9d39922 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include + +static struct gen_pool *atomic_pool __ro_after_init; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) +{ + unsigned int pool_size_order = get_order(atomic_pool_size); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + int ret; + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + pool_size_order, false); + else + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + if (!page) + goto out; + + arch_dma_prep_coherent(page, atomic_pool_size); + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto destroy_genpool; + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), atomic_pool_size, -1); + if (ret) + goto remove_mapping; + gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool = NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, pool_size_order); +out: + pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} +postcore_initcall(dma_atomic_pool_init); + +bool dma_in_atomic_pool(void *start, size_t size) +{ + if (unlikely(!atomic_pool)) + return false; + + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + + return ptr; +} + +bool dma_free_from_pool(void *start, size_t size) +{ + if (!dma_in_atomic_pool(start, size)) + return false; + gen_pool_free(atomic_pool, (unsigned long)start, size); + return true; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..f7b402849891 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. * Copyright (c) 2014 The Linux Foundation */ -#include -#include -#include -#include -#include +#include #include #include @@ -97,117 +92,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From c84dc6e68a1d2464e050d9694be4e4ff49e32bfd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:55 -0700 Subject: dma-pool: add additional coherent pools to map to gfp mask The single atomic pool is allocated from the lowest zone possible since it is guaranteed to be applicable for any DMA allocation. Devices may allocate through the DMA API but not have a strict reliance on GFP_DMA memory. Since the atomic pool will be used for all non-blockable allocations, returning all memory from ZONE_DMA may unnecessarily deplete the zone. Provision for multiple atomic pools that will map to the optimal gfp mask of the device. When allocating non-blockable memory, determine the optimal gfp mask of the device and use the appropriate atomic pool. The coherent DMA mask will remain the same between allocation and free and, thus, memory will be freed to the same atomic pool it was allocated from. __dma_atomic_pool_init() will be changed to return struct gen_pool * later once dynamic expansion is added. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 12 +++--- kernel/dma/pool.c | 120 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..a834ee22f8ff 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -89,8 +89,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -128,7 +128,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs) && !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -212,7 +212,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (force_dma_unencrypted(dev)) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 3df5d9d39922..db4f89ac5f5f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -10,7 +10,9 @@ #include #include -static struct gen_pool *atomic_pool __ro_after_init; +static struct gen_pool *atomic_pool_dma __ro_after_init; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -22,89 +24,119 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static gfp_t dma_atomic_pool_gfp(void) +static int __init __dma_atomic_pool_init(struct gen_pool **pool, + size_t pool_size, gfp_t gfp) { - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + const unsigned int order = get_order(pool_size); + const unsigned long nr_pages = pool_size >> PAGE_SHIFT; struct page *page; void *addr; int ret; if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); + page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + page = alloc_pages(gfp, order); if (!page) goto out; - arch_dma_prep_coherent(page, atomic_pool_size); + arch_dma_prep_coherent(page, pool_size); - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) + *pool = gen_pool_create(PAGE_SHIFT, -1); + if (!*pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, + addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto destroy_genpool; - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); + ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), + pool_size, -1); if (ret) goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + pool_size >> 10, &gfp); return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); + dma_common_free_remap(addr, pool_size); destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; + gen_pool_destroy(*pool); + *pool = NULL; free_page: if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); + __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); return -ENOMEM; } + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + int err; + + ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, + GFP_KERNEL); + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + err = __dma_atomic_pool_init(&atomic_pool_dma, + atomic_pool_size, GFP_DMA); + if (!ret && err) + ret = err; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + err = __dma_atomic_pool_init(&atomic_pool_dma32, + atomic_pool_size, GFP_DMA32); + if (!ret && err) + ret = err; + } + return ret; +} postcore_initcall(dma_atomic_pool_init); -bool dma_in_atomic_pool(void *start, size_t size) +static inline struct gen_pool *dev_to_pool(struct device *dev) { - if (unlikely(!atomic_pool)) - return false; + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); } -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) { + struct gen_pool *pool = dev_to_pool(dev); unsigned long val; void *ptr = NULL; - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); return NULL; } - val = gen_pool_alloc(atomic_pool, size); + val = gen_pool_alloc(pool, size); if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; @@ -114,10 +146,12 @@ void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) return ptr; } -bool dma_free_from_pool(void *start, size_t size) +bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - if (!dma_in_atomic_pool(start, size)) + struct gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); + gen_pool_free(pool, (unsigned long)start, size); return true; } -- cgit v1.2.3 From 0c1bc6b84525b96aa9fb8f6fbe8c5cb26a5c0ead Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:37 +0200 Subject: docs: filesystems: fix renamed references Some filesystem references got broken by a previous patch series I submitted. Address those. Signed-off-by: Mauro Carvalho Chehab Acked-by: David Sterba # fs/affs/Kconfig Link: https://lore.kernel.org/r/57318c53008dbda7f6f4a5a9e5787f4d37e8565a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) -- cgit v1.2.3 From 03c109d66867767ddbb0fe09223410a79193f5ea Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:58 +0200 Subject: futex: get rid of a kernel-docs build warning Adjust whitespaces and blank lines in order to get rid of this: ./kernel/futex.c:491: WARNING: Definition list ends without a blank line; unexpected unindent. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/57788af7889161483e0c97f91c079cfb3986c4b3.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex -- cgit v1.2.3 From a48b284b403a4a073d8beb72d2bb33e54df67fb6 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 20 Apr 2020 10:09:29 -0400 Subject: audit: fix a net reference leak in audit_send_reply() If audit_send_reply() fails when trying to create a new thread to send the reply it also fails to cleanup properly, leaking a reference to a net structure. This patch fixes the error path and makes a handful of other cleanups that came up while fixing the code. Reported-by: teroincn@gmail.com Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b69c8b460341..66b81358b64f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -924,19 +924,30 @@ out_kfree_skb: return NULL; } +static void audit_free_reply(struct audit_reply *reply) +{ + if (!reply) + return; + + if (reply->skb) + kfree_skb(reply->skb); + if (reply->net) + put_net(reply->net); + kfree(reply); +} + static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct sock *sk = audit_get_sk(reply->net); audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(sk, reply->skb, reply->portid, 0); - put_net(reply->net); - kfree(reply); + netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); + reply->skb = NULL; + audit_free_reply(reply); return 0; } @@ -950,35 +961,32 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the port id. - * No failure notifications. + * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - struct net *net = sock_net(NETLINK_CB(request_skb).sk); - struct sk_buff *skb; struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); + struct audit_reply *reply; + reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; - skb = audit_make_reply(seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->net = get_net(net); + reply->skb = audit_make_reply(seq, type, done, multi, payload, size); + if (!reply->skb) + goto err; + reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; - reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); + if (IS_ERR(tsk)) + goto err; + + return; + +err: + audit_free_reply(reply); } /* -- cgit v1.2.3 From 5c3a7db0c7ec4bbd5bd3f48af9be859a8fa3e532 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Apr 2020 19:13:03 +0200 Subject: module: Harden STRICT_MODULE_RWX We're very close to enforcing W^X memory, refuse to load modules that violate this principle per construction. [jeyu: move module_enforce_rwx_sections under STRICT_MODULE_RWX as per discussion] Link: http://lore.kernel.org/r/20200403171303.GK20760@hirez.programming.kicks-ass.net Acked-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jessica Yu --- kernel/module.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 01d01a489778..70fc20583e66 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2052,9 +2052,28 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + return -ENOEXEC; + } + + return 0; +} + #else /* !CONFIG_STRICT_MODULE_RWX */ /* module_{enable,disable}_ro() stubs are in module.h */ static void module_enable_nx(const struct module *mod) { } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + return 0; +} #endif /* CONFIG_STRICT_MODULE_RWX */ #ifdef CONFIG_LIVEPATCH @@ -3385,6 +3404,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (err < 0) return ERR_PTR(err); + err = module_enforce_rwx_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err < 0) + return ERR_PTR(err); + /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; -- cgit v1.2.3 From 51161bfc66a68d21f13d15a689b3ea7980457790 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 18:55:06 +0300 Subject: kernel/module: Hide vermagic header file from general use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VERMAGIC* definitions are not supposed to be used by the drivers, see this [1] bug report, so introduce special define to guard inclusion of this header file and define it in kernel/modules.h and in internal script that generates *.mod.c files. In-tree module build: âžœ kernel git:(vermagic) ✗ make clean âžœ kernel git:(vermagic) ✗ make M=drivers/infiniband/hw/mlx5 âžœ kernel git:(vermagic) ✗ modinfo drivers/infiniband/hw/mlx5/mlx5_ib.ko filename: /images/leonro/src/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions Out-of-tree module build: âžœ mlx5 make -C /images/leonro/src/kernel clean M=/tmp/mlx5 âžœ mlx5 make -C /images/leonro/src/kernel M=/tmp/mlx5 âžœ mlx5 modinfo /tmp/mlx5/mlx5_ib.ko filename: /tmp/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions [1] https://lore.kernel.org/lkml/20200411155623.GA22175@zn.tnic Reported-by: Borislav Petkov Acked-by: Borislav Petkov Acked-by: Jessica Yu Co-developed-by: Masahiro Yamada Signed-off-by: Masahiro Yamada Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..8833e848b73c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,6 +4,9 @@ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. */ + +#define INCLUDE_VERMAGIC + #include #include #include -- cgit v1.2.3 From 3054d06719079388a543de6adb812638675ad8f5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 21 Apr 2020 09:10:56 -0400 Subject: audit: fix a net reference leak in audit_list_rules_send() If audit_list_rules_send() fails when trying to create a new thread to send the rules it also fails to cleanup properly, leaking a reference to a net structure. This patch fixes the error patch and renames audit_send_list() to audit_send_list_thread() to better match its cousin, audit_send_reply_thread(). Reported-by: teroincn@gmail.com Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/audit.h | 2 +- kernel/auditfilter.c | 16 +++++++--------- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 66b81358b64f..622c30246d19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -880,7 +880,7 @@ main_queue: return 0; } -int audit_send_list(void *_dest) +int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; diff --git a/kernel/audit.h b/kernel/audit.h index 2eed4d231624..f0233dc40b17 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -229,7 +229,7 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *_dest); +int audit_send_list_thread(void *_dest); extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 026e34da4ace..a10e2997aa6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { - u32 portid = NETLINK_CB(request_skb).portid; - struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; - int err = 0; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(net); - dest->portid = portid; + dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); + dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); + put_net(dest->net); kfree(dest); - err = PTR_ERR(tsk); + return PTR_ERR(tsk); } - return err; + return 0; } int audit_comparator(u32 left, u32 op, u32 right) -- cgit v1.2.3 From 788f87ac608c518b74f338acb95f197cf6e3d0c4 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:09 +0300 Subject: xdp: export the DEV_MAP_BULK_SIZE macro Export the DEV_MAP_BULK_SIZE macro to the header file so that drivers can directly use it as the maximum number of xdp_frames received in the .ndo_xdp_xmit() callback. Signed-off-by: Ioana Ciornei Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..a51d9fb7a359 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -52,7 +52,6 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; -- cgit v1.2.3 From 54adadf9b08571fb8b11dc9d0d3a2ddd39825efd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 20 Apr 2020 12:09:58 +0200 Subject: dma-pool: dynamically expanding atomic pools When an atomic pool becomes fully depleted because it is now relied upon for all non-blocking allocations through the DMA API, allow background expansion of each pool by a kworker. When an atomic pool has less than the default size of memory left, kick off a kworker to dynamically expand the pool in the background. The pool is doubled in size, up to MAX_ORDER-1. If memory cannot be allocated at the requested order, smaller allocation(s) are attempted. This allows the default size to be kept quite low when one or more of the atomic pools is not used. Allocations for lowmem should also use GFP_KERNEL for the benefits of reclaim, so use GFP_KERNEL | GFP_DMA and GFP_KERNEL | GFP_DMA32 for lowmem allocations. This also allows __dma_atomic_pool_init() to return a pointer to the pool to make initialization cleaner. Also switch over some node ids to the more appropriate NUMA_NO_NODE. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 122 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index db4f89ac5f5f..ffe866c2c034 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -9,13 +9,17 @@ #include #include #include +#include static struct gen_pool *atomic_pool_dma __ro_after_init; static struct gen_pool *atomic_pool_dma32 __ro_after_init; static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; static int __init early_coherent_pool(char *p) { @@ -24,76 +28,116 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static int __init __dma_atomic_pool_init(struct gen_pool **pool, - size_t pool_size, gfp_t gfp) +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) { - const unsigned int order = get_order(pool_size); - const unsigned long nr_pages = pool_size >> PAGE_SHIFT; + unsigned int order; struct page *page; void *addr; - int ret; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); - else - page = alloc_pages(gfp, order); + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + else + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); if (!page) goto out; arch_dma_prep_coherent(page, pool_size); - *pool = gen_pool_create(PAGE_SHIFT, -1); - if (!*pool) - goto free_page; - addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) - goto destroy_genpool; + goto free_page; - ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), - pool_size, -1); + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); if (ret) goto remove_mapping; - gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", - pool_size >> 10, &gfp); return 0; remove_mapping: dma_common_free_remap(addr, pool_size); -destroy_genpool: - gen_pool_destroy(*pool); - *pool = NULL; free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) + if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", - pool_size >> 10, &gfp); - return -ENOMEM; + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; } static int __init dma_atomic_pool_init(void) { int ret = 0; - int err; - ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, - GFP_KERNEL); + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; if (IS_ENABLED(CONFIG_ZONE_DMA)) { - err = __dma_atomic_pool_init(&atomic_pool_dma, - atomic_pool_size, GFP_DMA); - if (!ret && err) - ret = err; + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; } if (IS_ENABLED(CONFIG_ZONE_DMA32)) { - err = __dma_atomic_pool_init(&atomic_pool_dma32, - atomic_pool_size, GFP_DMA32); - if (!ret && err) - ret = err; + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; } return ret; } @@ -142,6 +186,8 @@ void *dma_alloc_from_pool(struct device *dev, size_t size, ptr = (void *)val; memset(ptr, 0, size); } + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); return ptr; } -- cgit v1.2.3 From 76a19940bd62a81148c303f3df6d0cee9ae4b509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:58 -0700 Subject: dma-direct: atomic allocations must come from atomic coherent pools When a device requires unencrypted memory and the context does not allow blocking, memory must be returned from the atomic coherent pools. This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the config only requires CONFIG_DMA_COHERENT_POOL. This will be used for CONFIG_AMD_MEM_ENCRYPT in a subsequent patch. Keep all memory in these pools unencrypted. When set_memory_decrypted() fails, this prohibits the memory from being added. If adding memory to the genpool fails, and set_memory_encrypted() subsequently fails, there is no alternative other than leaking the memory. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 46 +++++++++++++++++++++++++++++++++++++++------- kernel/dma/pool.c | 27 ++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a834ee22f8ff..0a4881e59aa7 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -76,6 +76,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { @@ -125,9 +158,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; @@ -204,6 +235,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -211,10 +247,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ffe866c2c034..c8d61b3a7bd6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, arch_dma_prep_coherent(page, pool_size); +#ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto free_page; - +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). + */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), pool_size, NUMA_NO_NODE); if (ret) - goto remove_mapping; + goto encrypt_mapping; return 0; +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -free_page: +#endif +free_page: __maybe_unused if (!dma_release_from_contiguous(NULL, page, 1 << order)) __free_pages(page, order); out: -- cgit v1.2.3 From 2edc5bb3c5cc42131438460a50b7b16905c81c2a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:59 -0700 Subject: dma-pool: add pool sizes to debugfs The atomic DMA pools can dynamically expand based on non-blocking allocations that need to use it. Export the sizes of each of these pools, in bytes, through debugfs for measurement. Suggested-by: Christoph Hellwig Signed-off-by: David Rientjes [hch: remove the !CONFIG_DEBUG_FS stubs] Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index c8d61b3a7bd6..dde6de7f8e83 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2020 Google LLC */ +#include #include #include #include @@ -13,8 +14,11 @@ #include static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -29,6 +33,29 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -76,6 +103,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, if (ret) goto encrypt_mapping; + dma_atomic_pool_size_add(gfp, pool_size); return 0; encrypt_mapping: @@ -160,6 +188,8 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool_dma32) ret = -ENOMEM; } + + dma_atomic_pool_debugfs_init(); return ret; } postcore_initcall(dma_atomic_pool_init); -- cgit v1.2.3 From 1d659236fb43c4d2b37af7a4309681e834e9ec9a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:05:02 -0700 Subject: dma-pool: scale the default DMA coherent pool size with memory capacity When AMD memory encryption is enabled, some devices may use more than 256KB/sec from the atomic pools. It would be more appropriate to scale the default size based on memory capacity unless the coherent_pool option is used on the kernel command line. This provides a slight optimization on initial expansion and is deemed appropriate due to the increased reliance on the atomic pools. Note that the default size of 128KB per pool will normally be larger than the single coherent pool implementation since there are now up to three coherent pools (DMA, DMA32, and kernel). Note that even prior to this patch, coherent_pool= for sizes larger than 1 << (PAGE_SHIFT + MAX_ORDER-1) can fail. With new dynamic expansion support, this would be trivially extensible to allow even larger initial sizes. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index dde6de7f8e83..35bb51c31fff 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -20,8 +20,8 @@ static unsigned long pool_size_dma32; static struct gen_pool *atomic_pool_kernel __ro_after_init; static unsigned long pool_size_kernel; -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; /* Dynamic background expansion when the atomic pool is near capacity */ static struct work_struct atomic_pool_work; @@ -170,6 +170,16 @@ static int __init dma_atomic_pool_init(void) { int ret = 0; + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * + SZ_128K; + atomic_pool_size = min_t(size_t, atomic_pool_size, + 1 << (PAGE_SHIFT + MAX_ORDER-1)); + } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, -- cgit v1.2.3 From 298f3db6ee690259927b105d5ad1079563361323 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 23 Apr 2020 09:31:31 -0700 Subject: dma-contiguous: fix comment for dma_release_from_contiguous Commit 90ae409f9eb3 ("dma-direct: fix zone selection after an unaddressable CMA allocation") changed the logic in dma_release_from_contiguous to remove the normal pages fallback path, but did not update the comment. Fix that. Signed-off-by: Peter Collingbourne Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8bc6f2d670f9..15bc5026c485 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,8 +222,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. * * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need -- cgit v1.2.3 From ce5155c4f8226f4816f9f7cb88149ec275c7b0a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 18 Feb 2020 22:43:15 -0500 Subject: compat sysinfo(2): don't bother with field-by-field copyout Signed-off-by: Al Viro --- kernel/sys.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..b4a0324a0699 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2634,6 +2634,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2659,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From b0b3fb6759220d4fa359e9ac486859c9d422c204 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sat, 18 Apr 2020 09:37:35 +0800 Subject: bpf: Remove set but not used variable 'dst_known' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes gcc '-Wunused-but-set-variable' warning: kernel/bpf/verifier.c:5603:18: warning: variable ‘dst_known’ set but not used [-Wunused-but-set-variable], delete this variable. Signed-off-by: Mao Wenan Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200418013735.67882-1-maowenan@huawei.com --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa1d8245b925..15ba8bf92ca9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5609,7 +5609,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known, dst_known; + bool src_known; s64 smin_val, smax_val; u64 umin_val, umax_val; s32 s32_min_val, s32_max_val; @@ -5631,7 +5631,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (alu32) { src_known = tnum_subreg_is_const(src_reg.var_off); - dst_known = tnum_subreg_is_const(dst_reg->var_off); if ((src_known && (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || s32_min_val > s32_max_val || u32_min_val > u32_max_val) { @@ -5643,7 +5642,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, } } else { src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- kernel/bpf/cgroup.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n linux-next build bot reported compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} -- cgit v1.2.3 From 082b57e3eb09810d357083cca5ee2df02c16aec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 20 Apr 2020 11:47:50 -0700 Subject: net: bpf: Make bpf_ktime_get_ns() available to non GPL programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The entire implementation is in kernel/bpf/helpers.c: BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; and this was presumably marked GPL due to kernel/time/timekeeping.c: EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); and while that may make sense for kernel modules (although even that is doubtful), there is currently AFAICT no other source of time available to ebpf. Furthermore this is really just equivalent to clock_gettime(CLOCK_MONOTONIC) which is exposed to userspace (via vdso even to make it performant)... As such, I see no reason to keep the GPL restriction. (In the future I'd like to have access to time from Apache licensed ebpf code) Signed-off-by: Maciej Å»enczykowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dbba4f41d508..9a6b23387d02 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -151,7 +151,7 @@ BPF_CALL_0(bpf_ktime_get_ns) const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, }; -- cgit v1.2.3 From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Å»enczykowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: -- cgit v1.2.3 From 6f8a57ccf8511724e6f48d732cb2940889789ab2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Apr 2020 12:58:50 -0700 Subject: bpf: Make verifier log more relevant by default To make BPF verifier verbose log more releavant and easier to use to debug verification failures, "pop" parts of log that were successfully verified. This has effect of leaving only verifier logs that correspond to code branches that lead to verification failure, which in practice should result in much shorter and more relevant verifier log dumps. This behavior is made the default behavior and can be overriden to do exhaustive logging by specifying BPF_LOG_LEVEL2 log level. Using BPF_LOG_LEVEL2 to disable this behavior is not ideal, because in some cases it's good to have BPF_LOG_LEVEL2 per-instruction register dump verbosity, but still have only relevant verifier branches logged. But for this patch, I didn't want to add any new flags. It might be worth-while to just rethink how BPF verifier logging is performed and requested and streamline it a bit. But this trimming of successfully verified branches seems to be useful and a good default behavior. To test this, I modified runqslower slightly to introduce read of uninitialized stack variable. Log (**truncated in the middle** to save many lines out of this commit message) BEFORE this change: ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) [ ... instruction dump from insn #7 through #50 are cut out ... ] 51: (b7) r2 = 16 52: (85) call bpf_get_current_comm#16 last_idx 52 first_idx 42 regs=4 stack=0 before 51: (b7) r2 = 16 ; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, 53: (bf) r1 = r6 54: (18) r2 = 0xffff8881f3868800 56: (18) r3 = 0xffffffff 58: (bf) r4 = r7 59: (b7) r5 = 32 60: (85) call bpf_perf_event_output#25 last_idx 60 first_idx 53 regs=20 stack=0 before 59: (b7) r5 = 32 61: (bf) r2 = r10 ; event.pid = pid; 62: (07) r2 += -16 ; bpf_map_delete_elem(&start, &pid); 63: (18) r1 = 0xffff8881f3868000 65: (85) call bpf_map_delete_elem#3 ; } 66: (b7) r0 = 0 67: (95) exit from 44 to 66: safe from 34 to 66: safe from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881f3868000 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how there is a successful code path from instruction 0 through 67, few successfully verified jumps (44->66, 34->66), and only after that 11->28 jump plus error on instruction #32. AFTER this change (full verifier log, **no truncation**): ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) 7: (79) r2 = *(u64 *)(r1 +16) ; if (prev->state == TASK_RUNNING) 8: (55) if r2 != 0x0 goto pc+19 R1_w=ptr_task_struct(id=0,off=0,imm=0) R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; trace_enqueue(prev->tgid, prev->pid); 9: (61) r1 = *(u32 *)(r1 +1184) 10: (63) *(u32 *)(r10 -4) = r1 ; if (!pid || (targ_pid && targ_pid != pid)) 11: (15) if r1 == 0x0 goto pc+16 from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881db3ce800 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how in this case, there are 0-11 instructions + jump from 11 to 28 is recorded + 28-32 instructions with error on insn #32. test_verifier test runner was updated to specify BPF_LOG_LEVEL2 for VERBOSE_ACCEPT expected result due to potentially "incomplete" success verbose log at BPF_LOG_LEVEL1. On success, verbose log will only have a summary of number of processed instructions, etc, but no branch tracing log. Having just a last succesful branch tracing seemed weird and confusing. Having small and clean summary log in success case seems quite logical and nice, though. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200423195850.1259827-1-andriin@fb.com --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ba8bf92ca9..91728e0f27eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,6 +168,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -283,6 +285,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -846,7 +860,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +874,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +903,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +932,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -8407,6 +8424,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8683,7 +8701,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -10206,6 +10224,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10268,7 +10287,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ -- cgit v1.2.3 From 26363af5643490a817272e1cc6f1d3f1d550a699 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:35 +0200 Subject: mm: remove watermark_boost_factor_sysctl_handler watermark_boost_factor_sysctl_handler is just a pointless wrapper for proc_dointvec_minmax, so remove it and use proc_dointvec_minmax directly. Signed-off-by: Christoph Hellwig Acked-by: David Rientjes Signed-off-by: Al Viro --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..99d27acf4646 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1491,7 +1491,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { -- cgit v1.2.3 From 2374c09b1c8a883bb9b4b2fc3756703eeb618f4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:36 +0200 Subject: sysctl: remove all extern declaration from sysctl.c Extern declarations in .c files are a bad style and can lead to mismatches. Use existing definitions in headers where they exist, and otherwise move the external declarations to suitable header files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 45 +++------------------------------------------ 1 file changed, 3 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99d27acf4646..31b934865ebc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #include +#include +#include +#include #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -243,14 +212,6 @@ static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -- cgit v1.2.3 From f461d2dcd511c020a26d4d791fae595c65ed09b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:37 +0200 Subject: sysctl: avoid forward declarations Move the sysctl tables to the end of the file to avoid lots of pointless forward declarations. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 3573 +++++++++++++++++++++++++++---------------------------- 1 file changed, 1768 insertions(+), 1805 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31b934865ebc..3fafca3ced98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,79 +176,13 @@ enum sysctl_writes_mode { }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif -#endif - -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; +#endif /* CONFIG_PROC_SYSCTL */ #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif -/* The default sysctl tables: */ - -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; - #ifdef CONFIG_SCHED_DEBUG static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ @@ -265,1676 +199,12 @@ static int min_extfrag_threshold; static int max_extfrag_threshold = 1000; #endif -static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_DEBUG - { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, -#ifdef CONFIG_SMP - { - .procname = "sched_tunable_scaling", - .data = &sysctl_sched_tunable_scaling, - .maxlen = sizeof(enum sched_tunable_scaling), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_tunable_scaling, - .extra2 = &max_sched_tunable_scaling, - }, - { - .procname = "sched_migration_cost_ns", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_nr_migrate", - .data = &sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "numa_balancing", - .data = NULL, /* filled in by handler */ - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_numa_balancing, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, - { - .procname = "sysctl_writes_strict", - .data = &sysctl_writes_strict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_max_threads, - }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - .data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#endif - { - .procname = "ngroups_max", - .data = &ngroups_max, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "cap_last_cap", - .data = (void *)&cap_last_cap, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) - { - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN - { - .procname = "ignore-unaligned-usertrap", - .data = &no_unaligned_warning, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_IA64 - { - .procname = "unaligned-dump-stack", - .data = &unaligned_dump_stack, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DETECT_HUNG_TASK - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - }, -#endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_proc_update_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, - }, -#endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = proc_do_static_key, - }, -#endif -#if defined(CONFIG_TREE_RCU) - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -#endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &four, - }, -#ifdef CONFIG_COMPACTION - { - .procname = "compact_memory", - .data = &sysctl_compact_memory, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = sysctl_compaction_handler, - }, - { - .procname = "extfrag_threshold", - .data = &sysctl_extfrag_threshold, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, - }, - { - .procname = "compact_unevictable_allowed", - .data = &sysctl_compact_unevictable_allowed, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_warn_RT_change, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - -#endif /* CONFIG_COMPACTION */ - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, - }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - { - .procname = "memory_failure_early_kill", - .data = &sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -#ifdef CONFIG_USERFAULTFD - { - .procname = "unprivileged_userfaultfd", - .data = &sysctl_unprivileged_userfaultfd, - .maxlen = sizeof(sysctl_unprivileged_userfaultfd), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table debug_table[] = { -#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table dev_table[] = { - { } -}; - -int __init sysctl_init(void) -{ - struct ctl_table_header *hdr; - - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - +#endif /* CONFIG_SYSCTL */ + +/* + * /proc/sys support + */ + #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, @@ -3301,101 +1571,1794 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return -ENOSYS; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table kern_table[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, +#ifdef CONFIG_SMP + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { + .procname = "sched_migration_cost_ns", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_COREDUMP + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, +#endif +#ifdef CONFIG_BLK_DEV_INITRD + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PARISC + { + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_FUNCTION_TRACER + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, +#endif +#ifdef CONFIG_STACK_TRACER + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +#endif +#ifdef CONFIG_KEXEC_CORE + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_MODULES + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_UEVENT_HELPER + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = sysrq_sysctl_handler, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, + { + .procname = "random", + .mode = 0555, + .child = random_table, + }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#if defined CONFIG_PRINTK + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#endif + { + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = &sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_X86) + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN + { + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_IA64 + { + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#ifdef CONFIG_KEYS + { + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_proc_update_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, +#endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, +#endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table vm_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = &one_ul, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = &dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, +#endif + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &four, + }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_warn_RT_change, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + +#endif /* CONFIG_COMPACTION */ + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &one_thousand, + }, + { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_MMU + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#else + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_MMU + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, +#endif +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ + (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) + { + .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), +#endif + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table fs_table[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_FILE_LOCKING + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DNOTIFY + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_AIO + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif /* CONFIG_AIO */ +#ifdef CONFIG_INOTIFY_USER + { + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif +#endif + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .procname = "binfmt_misc", + .mode = 0555, + .child = sysctl_mount_point, + }, +#endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table dev_table[] = { + { } +}; -#endif /* CONFIG_PROC_SYSCTL */ +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; -#if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int __init sysctl_init(void) { - struct static_key *key = (struct static_key *)table->data; - static DEFINE_MUTEX(static_key_mutex); - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; + struct ctl_table_header *hdr; - mutex_lock(&static_key_mutex); - val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (val) - static_key_enable(key); - else - static_key_disable(key); - } - mutex_unlock(&static_key_mutex); - return ret; + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); + return 0; } -#endif +#endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handler just pass through the data to one of the common handlers a lot of the changes are mechnical. Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- kernel/bpf/cgroup.c | 35 +++---- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++++++++---------------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +-- 18 files changed, 132 insertions(+), 207 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; -- cgit v1.2.3 From 23b5ae2e8e1326c91b5dfdbb6ebcd5a6820074ae Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 17 Apr 2020 22:50:31 +0800 Subject: locking/rtmutex: Remove unused rt_mutex_cmpxchg_relaxed() Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1587135032-188866-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/locking/rtmutex.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) -- cgit v1.2.3 From 353159365e725fab7b3bf2817bec0ecd16706a38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:36:29 -0700 Subject: rcu: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9a49cd6065a..156ac8d0418b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -67,6 +67,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* -- cgit v1.2.3 From 4f58820fd710e0563e22420c07c03c5ccec948bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:33:01 -0700 Subject: srcu: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..ba2b751e9168 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; -- cgit v1.2.3 From 2f08469563550d15cb08a60898d3549720600eee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2020 05:29:58 -0800 Subject: rcu: Mark rcu_state.ncpus to detect concurrent writes The rcu_state structure's ncpus field is only to be modified by the CPU-hotplug CPU-online code path, which is single-threaded. This commit therefore enlists KCSAN's help in enforcing this restriction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..c88240a685ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3612,6 +3612,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); -- cgit v1.2.3 From 314eeb43e5f22856b281c91c966e51e5782a3498 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 14:18:12 -0800 Subject: rcu: Add *_ONCE() and data_race() to rcu_node ->exp_tasks plus locking There are lockless loads from the rcu_node structure's ->exp_tasks field, so this commit causes all stores to use WRITE_ONCE() and all lockless loads to use READ_ONCE() or data_race(), with the latter for debug prints. This code also did a unprotected traversal of the linked list pointed into by ->exp_tasks, so this commit also acquires the rcu_node structure's ->lock to properly protect this traversal. This list was traversed unprotected only when printing an RCU CPU stall warning for an expedited grace period, so the odds of seeing this in production are not all that high. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 19 +++++++++++-------- kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..c2b04daf1190 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -721,17 +721,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..35d77db035bd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -500,7 +500,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; @@ -761,7 +761,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -1036,7 +1036,7 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); -- cgit v1.2.3 From 065a6db12a80fac4eccc37e6bef14b0f4a610f31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:22:01 -0800 Subject: rcu: Add READ_ONCE and data_race() to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the READ_ONCE() to one load in order to avoid destructive compiler optimizations. The other load is from a diagnostic print, so data_race() suffices. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 35d77db035bd..ed6bb460c7c6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -760,7 +760,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; @@ -1036,7 +1036,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); -- cgit v1.2.3 From b68c6146512d92f6d570d26e1873497ade2cc4cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:36:59 -0800 Subject: srcu: Add data_race() to ->srcu_lock_count and ->srcu_unlock_count arrays The srcu_data structure's ->srcu_lock_count and ->srcu_unlock_count arrays are read and written locklessly, so this commit adds the data_race() to the diagnostic-print loads from these arrays in order mark them as known and approved data-racy accesses. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being used only by rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ba2b751e9168..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1281,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1290,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; -- cgit v1.2.3 From 5822b8126ff01e0baaf7d5168adc4ac8aeae088c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Jan 2020 10:44:41 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed6bb460c7c6..664d0aa48edb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -505,7 +505,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -1082,7 +1082,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->qsmask == 0 && (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); -- cgit v1.2.3 From 47fbb074536ecca5e0af3dcce340954c09ed4d1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Feb 2020 02:29:36 -0800 Subject: rcu: Use data_race() for RCU CPU stall-warning prints Although the accesses used to determine whether or not a stall should be printed are an integral part of the concurrency algorithm governing use of the corresponding variables, the values that are simply printed are ancillary. As such, it is best to use data_race() for these accesses in order to provide the greatest latitude in the use of KCSAN for the other accesses that are an integral part of the algorithm. This commit therefore changes the relevant uses of READ_ONCE() to data_race(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..e7da1111ab0c 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -360,7 +360,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { @@ -421,10 +421,10 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); @@ -581,23 +581,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,7 +607,7 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 1fca4d12f46371a34bf90af87f49548dd026c3ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Feb 2020 20:07:09 -0800 Subject: rcu: Expedite first two FQS scans under callback-overload conditions Even if some CPUs have excessive numbers of callbacks, RCU's grace-period kthread will still wait normally between successive force-quiescent-state scans. The first two are the most important, as they are the ones that enlist aid from the scheduler when overloaded. This commit therefore omits the wait before the first and the second force-quiescent-state scan under callback-overload conditions. This approach was inspired by a discussion with Jeff Roberson. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 +++++++++++++++---- kernel/rcu/tree.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c88240a685ec..0132bc6152dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1624,12 +1624,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1667,13 +1671,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1698,7 +1704,11 @@ static void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1718,6 +1728,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..44edd0a98ffe 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ -- cgit v1.2.3 From fcbcc0e700500fcecf24b9b705825135de30415e Mon Sep 17 00:00:00 2001 From: Zhaolong Zhang Date: Thu, 5 Mar 2020 14:56:11 -0800 Subject: rcu: Fix the (t=0 jiffies) false positive It is possible that an over-long grace period will end while the RCU CPU stall warning message is printing. In this case, the estimate of the offending grace period's duration can be erroneous due to refetching of rcu_state.gp_start, which will now be the time of the newly started grace period. Computation of this duration clearly needs to use the start time for the old over-long grace period, not the fresh new one. This commit avoids such errors by causing both print_other_cpu_stall() and print_cpu_stall() to reuse the value previously fetched by their caller. Signed-off-by: Zhaolong Zhang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e7da1111ab0c..3a7bc99e78e3 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -371,7 +371,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -408,7 +408,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -442,7 +442,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -467,7 +467,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +546,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +555,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From c28d5c09d09f86374a00b70a57d3cb75e3fc7fa9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 17 Mar 2020 15:54:18 +0100 Subject: rcu: Get rid of some doc warnings in update.c This commit escapes *ret, because otherwise the documentation system thinks that this is an incomplete emphasis block: ./kernel/rcu/update.c:65: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:65: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:70: WARNING: Inline emphasis start-string without end-string. ./kernel/rcu/update.c:82: WARNING: Inline emphasis start-string without end-string. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..72461dd80d29 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -63,12 +63,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +82,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section -- cgit v1.2.3 From 62ae19511f1efbe9f57346ca1f45e13b061a56ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Mar 2020 19:52:20 -0700 Subject: rcu: Mark rcu_state.gp_seq to detect more concurrent writes The rcu_state structure's gp_seq field is only to be modified by the RCU grace-period kthread, which is single-threaded. This commit therefore enlists KCSAN's help in enforcing this restriction. This commit applies KCSAN-specific primitives, so cannot go upstream until KCSAN does. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0132bc6152dd..183b9cf33f50 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1230,7 +1230,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1519,6 +1519,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1805,6 +1806,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); -- cgit v1.2.3 From a66dbda7893f48b97d7406ae42fa29190aa672a0 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Fri, 27 Mar 2020 21:23:53 +0000 Subject: rcu: Replace assigned pointer ret value by corresponding boolean value Coccinelle reports warnings at rcu_read_lock_held_common() WARNING: Assignment of 0/1 to bool variable To fix this, the assigned pointer ret values are replaced by corresponding boolean value. Given that ret is a pointer of bool type Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 72461dd80d29..17f23569e21a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -98,15 +98,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; -- cgit v1.2.3 From da44cd6c8e88b6da3d5277d0e7b0e4d38faf4532 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 30 Mar 2020 02:24:48 +0100 Subject: rcu: Replace 1 by true Coccinelle reports a warning at use_softirq declaration WARNING: Assignment of 0/1 to bool variable The root cause is use_softirq a variable of bool type is initialised with the integer 1 Replacing 1 with value true solve the issue. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 183b9cf33f50..940c62acd14a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -113,7 +113,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; -- cgit v1.2.3 From 29ffebc5fcc0bcd8e9bc9f9b3899f2d222b12b04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 14:48:20 -0700 Subject: rcu: Convert ULONG_CMP_GE() to time_after() for jiffy comparison This commit converts the ULONG_CMP_GE() in rcu_gp_fqs_loop() to time_after() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 940c62acd14a..e5903669e336 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1700,7 +1700,7 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); -- cgit v1.2.3 From 7b2413111a630469282c427033818977515ea592 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 15:52:53 -0700 Subject: rcu: Convert rcu_initiate_boost() ULONG_CMP_GE() to time_after() This commit converts the ULONG_CMP_GE() in rcu_initiate_boost() to time_after() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 664d0aa48edb..5cd27c2916c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1080,7 +1080,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From e2f3ccfa62001994ed3e81c309face75aaa8d372 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 17:05:22 -0700 Subject: rcu: Convert rcu_nohz_full_cpu() ULONG_CMP_LT() to time_before() This commit converts the ULONG_CMP_LT() in rcu_nohz_full_cpu() to time_before() to reflect the fact that it is comparing a timestamp to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5cd27c2916c0..5771e32a3840 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2537,7 +2537,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; -- cgit v1.2.3 From f87dc808009ac86c790031627698ef1a34c31e25 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:26 -0400 Subject: rcuperf: Add ability to increase object allocation size This allows us to increase memory pressure dynamically using a new rcuperf boot command line parameter called 'rcumult'. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { -- cgit v1.2.3 From 9154244c1ab6c9db4f1f25ac8f73bd46dba64287 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:27 -0400 Subject: rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching To reduce grace periods and improve kfree() performance, we have done batching recently dramatically bringing down the number of grace periods while giving us the ability to use kfree_bulk() for efficient kfree'ing. However, this has increased the likelihood of OOM condition under heavy kfree_rcu() flood on small memory systems. This patch introduces a shrinker which starts grace periods right away if the system is under memory pressure due to existence of objects that have still not started a grace period. With this patch, I do not observe an OOM anymore on a system with 512MB RAM and 8 CPUs, with the following rcuperf options: rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000 rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2 Otherwise it easily OOMs with the above parameters. NOTE: 1. On systems with no memory pressure, the patch has no effect as intended. 2. In the future, we can use this same mechanism to prevent grace periods from happening even more, by relying on shrinkers carefully. Cc: urezki@gmail.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..e299cd0ddd97 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2824,6 +2824,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2937,6 +2939,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + krcp->count = 0; + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3073,6 +3077,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + krcp->count++; + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3087,6 +3093,58 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long flags, count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + count += krcp->count; + spin_unlock_irqrestore(&krcp->lock, flags); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -4007,6 +4065,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) -- cgit v1.2.3 From a6a82ce18ba443186545d3fefbee8b9419a859dc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 16 Mar 2020 12:32:28 -0400 Subject: rcu/tree: Count number of batched kfree_rcu() locklessly We can relax the correctness of counting of number of queued objects in favor of not hurting performance, by locklessly sampling per-cpu counters. This should be Ok since under high memory pressure, it should not matter if we are off by a few objects while counting. The shrinker will still do the reclaim. Signed-off-by: Joel Fernandes (Google) [ paulmck: Remove unused "flags" variable. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e299cd0ddd97..3f1f57411fe1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2939,7 +2939,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } - krcp->count = 0; + WRITE_ONCE(krcp->count, 0); /* * One work is per one batch, so there are two "free channels", @@ -3077,7 +3077,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } - krcp->count++; + WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && @@ -3097,15 +3097,13 @@ static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; - unsigned long flags, count = 0; + unsigned long count = 0; /* Snapshot count of all CPUs */ for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); - count += krcp->count; - spin_unlock_irqrestore(&krcp->lock, flags); + count += READ_ONCE(krcp->count); } return count; -- cgit v1.2.3 From 6be7436d2245d3dd8b9a8f949367c13841c23308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 13:47:41 -0700 Subject: rcu: Add rcu_gp_might_be_stalled() This commit adds rcu_gp_might_be_stalled(), which returns true if there is some reason to believe that the RCU grace period is stalled. The use case is where an RCU free-memory path needs to allocate memory in order to free it, a situation that should be avoided where possible. But where it is necessary, there is always the alternative of using synchronize_rcu() to wait for a grace period in order to avoid the allocation. And if the grace period is stalled, allocating memory to asynchronously wait for it is a bad idea of epic proportions: Far better to let others use the memory, because these others might actually be able to free that memory before the grace period ends. Thus, rcu_gp_might_be_stalled() can be used to help decide whether allocating memory on an RCU free path is a semi-reasonable course of action. Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4dede00e2936 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -- cgit v1.2.3 From c76e7e0bce10876e6b08ac2ce8af5ef7cba684ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Apr 2020 10:19:02 -0700 Subject: rcu: Add KCSAN stubs to update.c This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..74a698aa9027 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -- cgit v1.2.3 From e4453d8a1c56050df320ef54f339ffa4a9513d0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Feb 2020 14:18:09 -0800 Subject: rcu: Make rcu_read_unlock_special() safe for rq/pi locks The scheduler is currently required to hold rq/pi locks across the entire RCU read-side critical section or not at all. This is inconvenient and leaves traps for the unwary, including the author of this commit. But now that excessively long grace periods enable scheduling-clock interrupts for holdout nohz_full CPUs, the nohz_full rescue logic in rcu_read_unlock_special() can be dispensed with. In other words, the rcu_read_unlock_special() function can refrain from doing wakeups unless such wakeups are guaranteed safe. This commit therefore avoids unsafe wakeups, freeing the scheduler to hold rq/pi locks across rcu_read_unlock() even if the corresponding RCU read-side critical section might have been preempted. This commit also updates RCU's requirements documentation. This commit is inspired by a patch from Lai Jiangshan: https://lore.kernel.org/lkml/20191102124559.1135-2-laijs@linux.alibaba.com This commit is further intended to be a step towards his goal of permitting the inlining of RCU-preempt's rcu_read_lock() and rcu_read_unlock(). Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..ccad77639d80 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,19 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && -- cgit v1.2.3 From 07b4a930fc44a537efecf73c1fd2b4937f64caaa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:37:26 -0800 Subject: rcu: Don't set nesting depth negative in rcu_preempt_deferred_qs() Now that RCU flavors have been consolidated, an RCU-preempt rcu_read_unlock() in an interrupt or softirq handler cannot possibly end the RCU read-side critical section. Consider the old vulnerability involving rcu_preempt_deferred_qs() being invoked within such a handler that interrupted an extended RCU read-side critical section, in which a wakeup might be invoked with a scheduler lock held. Because rcu_read_unlock_special() no longer does wakeups in such situations, it is no longer necessary for rcu_preempt_deferred_qs() to set the nesting level negative. This commit therefore removes this recursion-protection code from rcu_preempt_deferred_qs(). [ paulmck: Fix typo in commit log per Steve Rostedt. ] Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ccad77639d80..263c766b9dc1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -569,16 +569,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* -- cgit v1.2.3 From f0bdf6d473cf12a488a78422e15aafdfe77cf853 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:52:32 -0800 Subject: rcu: Remove unused ->rcu_read_unlock_special.b.deferred_qs field The ->rcu_read_unlock_special.b.deferred_qs field is set to true in rcu_read_unlock_special() but never set to false. This is not particularly useful, so this commit removes this field. The only possible justification for this field is to ease debugging of RCU deferred quiscent states, but the combination of the other ->rcu_read_unlock_special fields plus ->rcu_blocked_node and of course ->rcu_read_lock_nesting should cover debugging needs. And if this last proves incorrect, this patch can always be reverted, along with the required setting of ->rcu_read_unlock_special.b.deferred_qs to false in rcu_preempt_deferred_qs_irqrestore(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 263c766b9dc1..f31c5992f842 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -634,7 +634,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3 From 5f5fa7ea89dc82d34ed458f4d7a8634e8e9eefce Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 15:23:26 -0800 Subject: rcu: Don't use negative nesting depth in __rcu_read_unlock() Now that RCU flavors have been consolidated, an RCU-preempt rcu_read_unlock() in an interrupt or softirq handler cannot possibly end the RCU read-side critical section. Consider the old vulnerability involving rcu_read_unlock() being invoked within such a handler that interrupted an __rcu_read_unlock_special(), in which a wakeup might be invoked with a scheduler lock held. Because rcu_read_unlock_special() no longer does wakeups in such situations, it is no longer necessary for __rcu_read_unlock() to set the nesting level negative. This commit therefore removes this recursion-protection code from __rcu_read_unlock(). [ paulmck: Let rcu_exp_handler() continue to call rcu_report_exp_rdp(). ] [ paulmck: Adjust other checks given no more negative nesting. ] Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 31 +++++-------------------------- kernel/rcu/tree_plugin.h | 22 +++++++--------------- 2 files changed, 12 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..0e5ccb330f9d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f31c5992f842..088e84e4578f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -345,9 +345,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +353,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +388,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -556,7 +548,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -692,7 +684,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } -- cgit v1.2.3 From 52b1fc3f798d02a3a9d1cf7a84e98a795223410a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Mar 2020 18:53:25 -0700 Subject: rcutorture: Add test of holding scheduler locks across rcu_read_unlock() Now that it should be safe to hold scheduler locks across rcu_read_unlock(), even in cases where the corresponding RCU read-side critical section might have been preempted and boosted, the commit adds a test of this capability to rcutorture. This has been tested on current mainline (which can deadlock in this situation), and lockdep duly reported the expected deadlock. On -rcu, lockdep is silent, thus far, anyway. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..b348cf816d89 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1147,6 +1147,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1182,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) -- cgit v1.2.3 From ac3caf827488d3bc4d4101ff34931abbfa33839d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Mar 2020 17:01:57 -0700 Subject: rcu: Add comments marking transitions between RCU watching and not It is not as clear as it might be just where in RCU's idle entry/exit code RCU stops and starts watching the current CPU. This commit therefore adds comments calling out the transitions. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..0bbcbf398169 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -238,7 +238,9 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ static void rcu_dynticks_eqs_enter(void) { @@ -251,7 +253,7 @@ static void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -261,7 +263,8 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ static void rcu_dynticks_eqs_exit(void) { @@ -274,6 +277,7 @@ static void rcu_dynticks_eqs_exit(void) * critical section. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { @@ -584,6 +588,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; return; } @@ -596,7 +601,9 @@ static void rcu_eqs_enter(bool user) rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -676,7 +683,9 @@ static __always_inline void rcu_nmi_exit_common(bool irq) if (irq) rcu_prepare_for_idle(); + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. if (irq) rcu_dynticks_task_enter(); @@ -751,11 +760,14 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -832,7 +844,9 @@ static __always_inline void rcu_nmi_enter_common(bool irq) if (irq) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. if (irq) rcu_cleanup_after_idle(); @@ -842,9 +856,16 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !READ_ONCE(rdp->rcu_forced_tick)) { + // We get here only if we had already exited the extended + // quiescent state and this was an interrupt (not an NMI). + // Therefore, (1) RCU is already watching and (2) The fact + // that we are in an interrupt handler and that the rcu_node + // lock is an irq-disabled lock prevents self-deadlock. + // So we can safely recheck under the lock. raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU + // needs a quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } -- cgit v1.2.3 From 66777e5821f6e672003fde697b8489402bb5aa98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 15:22:44 -0700 Subject: rcu-tasks: Use context-switch hook for PREEMPT=y kernels Currently, the PREEMPT=y version of rcu_note_context_switch() does not invoke rcu_tasks_qs(), and we need it to in order to keep RCU Tasks Trace's IPIs down to a dull roar. This commit therefore enables this hook. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 088e84e4578f..4f34c325dd90 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,6 +331,8 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + if (!preempt) + rcu_tasks_qs(current); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -- cgit v1.2.3 From 2beaf3280e57bb891f8012dca49c87ed0f01e2f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 14:23:21 -0700 Subject: sched/core: Add function to sample state of locked-down task A running task's state can be sampled in a consistent manner (for example, for diagnostic purposes) simply by invoking smp_call_function_single() on its CPU, which may be obtained using task_cpu(), then having the IPI handler verify that the desired task is in fact still running. However, if the task is not running, this sampling can in theory be done immediately and directly. In practice, the task might start running at any time, including during the sampling period. Gaining a consistent sample of a not-running task therefore requires that something be done to lock down the target task's state. This commit therefore adds a try_invoke_on_locked_down_task() function that invokes a specified function if the specified task can be locked down, returning true if successful and if the specified function returns true. Otherwise this function simply returns false. Given that the function passed to try_invoke_on_nonrunning_task() might be invoked with a runqueue lock held, that function had better be quite lightweight. The function is passed the target task's task_struct pointer and the argument passed to try_invoke_on_locked_down_task(), allowing easy access to task state and to a location for further variables to be passed in and out. Note that the specified function will be called even if the specified task is currently running. The function can use ->on_rq and task_curr() to quickly and easily determine the task's state, and can return false if this state is not to the function's liking. The caller of the try_invoke_on_locked_down_task() would then see the false return value, and could take appropriate action, for example, trying again later or sending an IPI if matters are more urgent. It is expected that use cases such as the RCU CPU stall warning code will simply return false if the task is currently running. However, there are use cases involving nohz_full CPUs where the specified function might instead fall back to an alternative sampling scheme that relies on heavier synchronization (such as memory barriers) in the target task. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Ben Segall Cc: Mel Gorman [ paulmck: Apply feedback from Peter Zijlstra and Steven Rostedt. ] [ paulmck: Invoke if running to handle feedback from Mathieu Desnoyers. ] Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5ca567adfcb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2566,6 +2566,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2639,6 +2641,52 @@ out: return success; } +/** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. -- cgit v1.2.3 From 5bef8da66a9c45d3c371d74463894f1fc31dfdda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 17:35:46 -0700 Subject: rcu: Add per-task state to RCU CPU stall warnings Currently, an RCU-preempt CPU stall warning simply lists the PIDs of those tasks holding up the current grace period. This can be helpful, but more can be even more helpful. To this end, this commit adds the nesting level, whether the task thinks it was preempted in its current RCU read-side critical section, whether RCU core has asked this task for a quiescent state, whether the expedited-grace-period hint is set, and whether the task believes that it is on the blocked-tasks list (it must be, or it would not be printed, but if things are broken, best not to take too much for granted). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..c65c9759e038 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -192,14 +192,40 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_node *rnp; + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rnp = t->rcu_blocked_node; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +234,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); -- cgit v1.2.3 From eacd6f04a1333187dd3e96e5635c0edce0a2e354 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 11:59:20 -0800 Subject: rcu-tasks: Move Tasks RCU to its own file This code-movement-only commit is in preparation for adding an additional flavor of Tasks RCU, which relies on workqueues to detect grace periods. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 370 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 366 +-------------------------------------------------- 2 files changed, 372 insertions(+), 364 deletions(-) create mode 100644 kernel/rcu/tasks.h (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..be8d179a4ca9 --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. + */ + +/* Global list of callbacks and associated lock. */ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +static struct task_struct *rcu_tasks_kthread_ptr; + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + needwake = !rcu_tasks_cbs_head; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) + wake_up(&rcu_tasks_cbs_wq); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + unsigned long lastreport; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rcu_tasks_cbs_wq, + READ_ONCE(rcu_tasks_cbs_head)); + if (!rcu_tasks_cbs_head) { + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_rcu() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_rcu(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_rcu() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. + * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_rcu() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && + time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() + * to force the needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. + */ + synchronize_rcu(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_uninterruptible(HZ/10); + } +} + +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ +static int __init rcu_spawn_tasks_kthread(void) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; + smp_mb(); /* Ensure others see full kthread. */ + WRITE_ONCE(rcu_tasks_kthread_ptr, t); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 74a698aa9027..c5799349ff31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -514,370 +514,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. - */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -948,6 +584,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* -- cgit v1.2.3 From 07e105158d97b4969891e844f318d16f6cef566c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 15:16:57 -0800 Subject: rcu-tasks: Create struct to hold state information This commit creates an rcu_tasks struct to hold state information for RCU Tasks. This is a preparation commit for adding additional flavors of Tasks RCU, each of which would have its own rcu_tasks struct. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 73 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index be8d179a4ca9..5ccfe0d64e6a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -7,6 +7,30 @@ #ifdef CONFIG_TASKS_RCU +/** + * Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + struct task_struct *kthread_ptr; +}; + +#define DEFINE_RCU_TASKS(name) \ +static struct rcu_tasks name = \ +{ \ + .cbs_tail = &name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ +} + /* * Simple variant of RCU whose quiescent states are voluntary context * switch, cond_resched_rcu_qs(), user-space execution, and idle. @@ -18,12 +42,7 @@ * rates from multiple CPUs. If this is required, per-CPU callback lists * will be needed. */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); +DEFINE_RCU_TASKS(rcu_tasks); /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -33,8 +52,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -static struct task_struct *rcu_tasks_kthread_ptr; - /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @rhp: structure to be used for queueing the RCU updates. @@ -57,17 +74,18 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -169,10 +187,12 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + struct rcu_tasks *rtp = arg; int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -183,17 +203,17 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); /* If there were none, wait a bit and start over. */ if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); } @@ -211,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) * * This synchronize_rcu() also dispenses with the * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen + * t->rcu_tasks_holdout, as it forces the store to happen * after the beginning of the grace period. */ synchronize_rcu(); @@ -278,7 +298,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) firstreport = true; WARN_ON(signal_pending(current)); list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { + rcu_tasks_holdout_list) { check_holdout_task(t, needreport, &firstreport); cond_resched(); } @@ -325,11 +345,10 @@ static int __init rcu_spawn_tasks_kthread(void) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); return 0; } core_initcall(rcu_spawn_tasks_kthread); -- cgit v1.2.3 From 9cf8fc6fabd46d7f4729529f88d627ce85c6e970 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Mar 2020 14:00:46 -0800 Subject: rcutorture: Add a test for synchronize_rcu_mult() This commit adds a crude test for synchronize_rcu_mult(). This is currently a smoke test rather than a high-quality stress test. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b348cf816d89..fbb3e6247443 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -665,6 +665,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +679,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, -- cgit v1.2.3 From 5873b8a94e5dae04b8e11fc798df512614e6d1e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 11:49:21 -0800 Subject: rcu-tasks: Refactor RCU-tasks to allow variants to be added This commit splits out generic processing from RCU-tasks-specific processing in order to allow additional flavors to be added. It also adds a def_bool TASKS_RCU_GENERIC to enable the common RCU-tasks infrastructure code. This is primarily, but not entirely, a code-movement commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 10 +- kernel/rcu/tasks.h | 491 +++++++++++++++++++++++++++------------------------- kernel/rcu/update.c | 4 + 3 files changed, 269 insertions(+), 236 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..38475d0bc634 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. config RCU_STALL_COMMON def_bool TREE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5ccfe0d64e6a..d77921ee5a6e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,7 +5,13 @@ * Copyright (C) 2020 Paul E. McKenney */ -#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -14,6 +20,8 @@ * @cbs_wq: Wait queue allowning new callback to get kthread's attention. * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @call_func: This flavor's call_rcu()-equivalent function. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -21,29 +29,20 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + call_rcu_func_t call_func; }; -#define DEFINE_RCU_TASKS(name) \ +#define DEFINE_RCU_TASKS(name, gp, call) \ static struct rcu_tasks name = \ { \ .cbs_tail = &name.cbs_head, \ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ } -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ -DEFINE_RCU_TASKS(rcu_tasks); - /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -52,29 +51,16 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) { unsigned long flags; bool needwake; - struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; @@ -87,108 +73,25 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); } -EXPORT_SYMBOL_GPL(call_rcu_tasks); -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); + wait_rcu_gp(rtp->call_func); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; struct rcu_head *list; struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); struct rcu_tasks *rtp = arg; - int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -220,111 +123,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) continue; } - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * t->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); + // Wait for one grace period. + rtp->gp_func(rtp); /* Invoke the callbacks. */ while (list) { @@ -340,18 +140,16 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; + return; smp_mb(); /* Ensure others see full kthread. */ - return 0; } -core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -369,8 +167,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) preempt_enable(); } -#endif /* #ifdef CONFIG_TASKS_RCU */ - #ifndef CONFIG_TINY_RCU /* @@ -387,3 +183,230 @@ static void __init rcu_tasks_bootup_oddness(void) } #endif /* #ifndef CONFIG_TINY_RCU */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..30dce20e1644 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,7 +584,11 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From c84aad765406c4c7573ce449e8a9977ebb8f4cb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 21:06:43 -0800 Subject: rcu-tasks: Add an RCU-tasks rude variant This commit adds a "rude" variant of RCU-tasks that has as quiescent states schedule(), cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, anyway) cond_resched(). In other words, RCU-tasks rude readers are regions of code with preemption disabled, but excluding code early in the CPU-online sequence and late in the CPU-offline sequence. Updates make use of IPIs and force an IPI and a context switch on each online CPU. This variant is useful in some situations in tracing. Suggested-by: Steven Rostedt [ paulmck: Apply EXPORT_SYMBOL_GPL() feedback from Qiujun Huang. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply review feedback from Steve Rostedt. ] --- kernel/rcu/Kconfig | 11 +++++- kernel/rcu/tasks.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 38475d0bc634..6ee6372a4459 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -84,6 +84,15 @@ config TASKS_RCU only voluntary context switch (not preemption!), idle, and user-mode execution as quiescent states. Not for manual selection. +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d77921ee5a6e..7f9ed20c26c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,6 +180,9 @@ static void __init rcu_tasks_bootup_oddness(void) else pr_info("\tTasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -410,3 +413,98 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -- cgit v1.2.3 From 3d6e43c75d6bab212e8bc142585ee36eb8e2e5d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 15:02:50 -0800 Subject: rcutorture: Add torture tests for RCU Tasks Rude This commit adds the definitions required to torture the rude flavor of RCU tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 ++ kernel/rcu/rcu.h | 1 + kernel/rcu/rcutorture.c | 31 +++++++++++++++++++++++++++++-- 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..b5f3545b0cfb 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,7 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +47,7 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..c5746202e124 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -441,6 +441,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fbb3e6247443..6b0663801a82 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -730,6 +730,33 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -739,7 +766,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -2413,7 +2440,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From c97d12a63c26fc4521d0904f073f9997ae796cba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 15:50:31 -0800 Subject: rcu-tasks: Use unique names for RCU-Tasks kthreads and messages This commit causes the flavors of RCU Tasks to use different names for their kthreads and in their console messages. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7f9ed20c26c7..9ca83c68e486 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -22,6 +22,8 @@ typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -31,16 +33,20 @@ struct rcu_tasks { struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; call_rcu_func_t call_func; + char *name; + char *kname; }; -#define DEFINE_RCU_TASKS(name, gp, call) \ -static struct rcu_tasks name = \ +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ { \ - .cbs_tail = &name.cbs_head, \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ - .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ .gp_func = gp, \ .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ } /* Track exiting tasks in order to allow them to be waited for. */ @@ -145,8 +151,8 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) return; smp_mb(); /* Ensure others see full kthread. */ } @@ -342,7 +348,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) } void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period @@ -437,7 +443,8 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) } void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); /** * call_rcu_tasks_rude() - Queue a callback rude task-based grace period -- cgit v1.2.3 From e4fe5dd6f26f74233e217d9dd351adc3e5165bb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Mar 2020 17:31:43 -0800 Subject: rcu-tasks: Further refactor RCU-tasks to allow adding more variants This commit refactors RCU tasks to allow variants to be added. These variants will share the current Tasks-RCU tasklist scan and the holdout list processing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 166 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9ca83c68e486..344426e2390d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -12,6 +12,11 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(void); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(void); /** * Definition for a Tasks-RCU-like mechanism. @@ -21,6 +26,11 @@ typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdout_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @name: This flavor's textual name. * @kname: This flavor's kthread name. @@ -32,6 +42,11 @@ struct rcu_tasks { raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; call_rcu_func_t call_func; char *name; char *kname; @@ -113,6 +128,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_unlock_lock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -207,6 +223,49 @@ static void __init rcu_tasks_bootup_oddness(void) // rates from multiple CPUs. If this is required, per-CPU callback lists // will be needed. +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning taskslist and draining the holdout list. */ +void rcu_tasks_postscan(void) +{ + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + /* See if tasks are still holding out, complain if so. */ static void check_holdout_task(struct task_struct *t, bool needreport, bool *firstreport) @@ -239,55 +298,63 @@ static void check_holdout_task(struct task_struct *t, sched_show_task(t); } +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(void) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { struct task_struct *g, *t; unsigned long lastreport; - LIST_HEAD(rcu_tasks_holdouts); + LIST_HEAD(holdouts); int fract; - /* - * Wait for all pre-existing t->on_rq and t->nvcsw transitions - * to complete. Invoking synchronize_rcu() suffices because all - * these transitions occur with interrupts disabled. Without this - * synchronize_rcu(), a read-side critical section that started - * before the grace period might be incorrectly seen as having - * started after the grace period. - * - * This synchronize_rcu() also dispenses with the need for a - * memory barrier on the first store to t->rcu_tasks_holdout, - * as it forces the store to happen after the beginning of the - * grace period. - */ - synchronize_rcu(); + rtp->pregp_func(); /* * There were callbacks, so we need to wait for an RCU-tasks * grace period. Start off by scanning the task list for tasks * that are not already voluntarily blocked. Mark these tasks - * and make a list of them in rcu_tasks_holdouts. + * and make a list of them in holdouts. */ rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); rcu_read_unlock(); - /* - * Wait for tasks that are in the process of exiting. This - * does only part of the job, ensuring that all tasks that were - * previously exiting reach the point where they have disabled - * preemption, allowing the later synchronize_rcu() to finish - * the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); + rtp->postscan_func(); /* * Each pass through the following loop scans the list of holdout @@ -303,9 +370,8 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) bool firstreport; bool needreport; int rtst; - struct task_struct *t1; - if (list_empty(&rcu_tasks_holdouts)) + if (list_empty(&holdouts)) break; /* Slowly back off waiting for holdouts */ @@ -320,31 +386,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) lastreport = jiffies; firstreport = true; WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } + rtp->holdouts_func(&holdouts, needreport, &firstreport); } - /* - * Because ->on_rq and ->nvcsw are not guaranteed to have a full - * memory barriers prior to them in the schedule() path, memory - * reordering on other CPUs could cause their RCU-tasks read-side - * critical sections to extend past the end of the grace period. - * However, because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() to force the - * needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all ->rcu_tasks_holdout - * accesses to be within the grace period, avoiding the need for - * memory barriers for ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting tasks - * to complete their final preempt_disable() region of execution, - * cleaning up after the synchronize_srcu() above. - */ - synchronize_rcu(); + rtp->postgp_func(); } void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); @@ -413,6 +458,11 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -- cgit v1.2.3 From d01aa2633b5d5ebc16fa47ad7a5e8e9f00482554 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Mar 2020 17:07:07 -0800 Subject: rcu-tasks: Code movement to allow more Tasks RCU variants This commit does nothing but move rcu_tasks_wait_gp() up to a new section for common code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 122 +++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 344426e2390d..d8b09d5d1db1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -211,6 +211,69 @@ static void __init rcu_tasks_bootup_oddness(void) #ifdef CONFIG_TASKS_RCU +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + rtp->postscan_func(); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + rtp->postgp_func(); +} + //////////////////////////////////////////////////////////////////////// // // Simple variant of RCU whose quiescent states are voluntary context @@ -333,65 +396,6 @@ static void rcu_tasks_postgp(void) synchronize_rcu(); } -/* Wait for one RCU-tasks grace period. */ -static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) -{ - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); - int fract; - - rtp->pregp_func(); - - /* - * There were callbacks, so we need to wait for an RCU-tasks - * grace period. Start off by scanning the task list for tasks - * that are not already voluntarily blocked. Mark these tasks - * and make a list of them in holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) - rtp->pertask_func(t, &holdouts); - rcu_read_unlock(); - - rtp->postscan_func(); - - /* - * Each pass through the following loop scans the list of holdout - * tasks, removing any that are no longer holdouts. When the list - * is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - - if (list_empty(&holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - rtp->holdouts_func(&holdouts, needreport, &firstreport); - } - - rtp->postgp_func(); -} - void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); -- cgit v1.2.3 From d5f177d35c24429c87db2567d20563fc16f7e8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Mar 2020 19:56:53 -0700 Subject: rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks Because RCU does not watch exception early-entry/late-exit, idle-loop, or CPU-hotplug execution, protection of tracing and BPF operations is needlessly complicated. This commit therefore adds a variant of Tasks RCU that: o Has explicit read-side markers to allow finite grace periods in the face of in-kernel loops for PREEMPT=n builds. These markers are rcu_read_lock_trace() and rcu_read_unlock_trace(). o Protects code in the idle loop, exception entry/exit, and CPU-hotplug code paths. In this respect, RCU-tasks trace is similar to SRCU, but with lighter-weight readers. o Avoids expensive read-side instruction, having overhead similar to that of Preemptible RCU. There are of course downsides: o The grace-period code can send IPIs to CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. This is mitigated by later commits. o It is necessary to scan the full tasklist, much as for Tasks RCU. o There is a single callback queue guarded by a single lock, again, much as for Tasks RCU. However, those early use cases that request multiple grace periods in quick succession are expected to do so from a single task, which makes the single lock almost irrelevant. If needed, multiple callback queues can be provided using any number of schemes. Perhaps most important, this variant of RCU does not affect the vanilla flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace readers can operate from idle, offline, and exception entry/exit in no way enables rcu_preempt and rcu_sched readers to do so. The memory ordering was outlined here: https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/ This effort benefited greatly from off-list discussions of BPF requirements with Alexei Starovoitov and Andrii Nakryiko. At least some of the on-list discussions are captured in the Link: tags below. In addition, KCSAN was quite helpful in finding some early bugs. Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/ Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/ Cc: Alexei Starovoitov Cc: Andrii Nakryiko [ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ] [ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ] [ paulmck: Fix locking issue reported by rcutorture. ] Signed-off-by: Paul E. McKenney --- kernel/fork.c | 4 + kernel/rcu/Kconfig | 11 +- kernel/rcu/tasks.h | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 371 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..72e9396235b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 6ee6372a4459..cb1d18ef343c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -93,6 +93,15 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d8b09d5d1db1..fd34fd673a8c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -181,12 +181,17 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) preempt_enable(); } +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { + struct task_struct *t = current; + preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); preempt_enable(); + exit_tasks_rcu_finish_trace(t); } #ifndef CONFIG_TINY_RCU @@ -196,15 +201,19 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) */ static void __init rcu_tasks_bootup_oddness(void) { -#ifdef CONFIG_TASKS_RCU +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU pr_info("\tRude variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -569,3 +578,347 @@ static int __init rcu_spawn_tasks_rude_kthread(void) core_initcall(rcu_spawn_tasks_rude_kthread); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + if (atomic_dec_and_test(&trc_n_readers_need_end)) + wake_up(&trc_wait); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + // If the task is in a read-side critical section, set up its + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + if (unlikely(t->trc_reader_nesting)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + } + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Wait for CPU-hotplug paths to complete. + cpus_read_lock(); + cpus_read_unlock(); + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + t->trc_reader_checked = false; + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* Do intermediate processing between task and holdout scans. */ +static void rcu_tasks_trace_postscan(void) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool ndrpt, bool *frptp) +{ + struct task_struct *g, *t; + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(void) +{ + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + wait_event_idle_exclusive(trc_wait, + atomic_read(&trc_n_readers_need_end) == 0); + + smp_mb(); // Caller's code must be ordered after wakeup. +} + +/* Report any needed quiescent state for this exiting task. */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3 From c1a76c0b6abac4e7eb49b5c24a0829f47b70769d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Mar 2020 10:32:30 -0700 Subject: rcutorture: Add torture tests for RCU Tasks Trace This commit adds the definitions required to torture the tracing flavor of RCU tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 ++ kernel/rcu/rcu.h | 1 + kernel/rcu/rcutorture.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index b5f3545b0cfb..452feae8de20 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -30,6 +30,7 @@ config RCU_PERF_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -48,6 +49,7 @@ config RCU_TORTURE_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5746202e124..72903867833e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -442,6 +442,7 @@ enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b0663801a82..0bec9254959b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "rcu.h" @@ -757,6 +758,45 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. */ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1323,6 +1363,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -2440,7 +2481,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From 4593e772b5020e714e18f6e212d70b24fbe88b79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Mar 2020 12:13:53 -0700 Subject: rcu-tasks: Add stall warnings for RCU Tasks Trace This commit adds RCU CPU stall warnings for RCU Tasks Trace. These dump out any tasks blocking the current grace period, as well as any CPUs that have not responded to an IPI request. This happens in two phases, when initially extracting state from the tasks and later when waiting for any holdout tasks to check in. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd34fd673a8c..4237881e7780 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -798,9 +798,41 @@ static void rcu_tasks_trace_postscan(void) // Any tasks that exit after this point will set ->trc_reader_checked. } +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_need_end], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + /* Do one scan of the holdout list. */ static void check_all_holdout_tasks_trace(struct list_head *hop, - bool ndrpt, bool *frptp) + bool needreport, bool *firstreport) { struct task_struct *g, *t; @@ -813,21 +845,51 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, // If check succeeded, remove this task from the list. if (READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); } } /* Wait for grace period to complete and provide ordering. */ static void rcu_tasks_trace_postgp(void) { + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + // Remove the safety count. smp_mb__before_atomic(); // Order vs. earlier atomics atomic_dec(&trc_n_readers_need_end); smp_mb__after_atomic(); // Order vs. later atomics // Wait for readers. - wait_event_idle_exclusive(trc_wait, - atomic_read(&trc_n_readers_need_end) == 0); - + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_need_end)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_need_end)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } smp_mb(); // Caller's code must be ordered after wakeup. } -- cgit v1.2.3 From 8fd8ca388ccf233b8ae0b6b42ddc7caa5034ae85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Mar 2020 14:51:20 -0700 Subject: rcu-tasks: Move #ifdef into tasks.h This commit pushes the #ifdef CONFIG_TASKS_RCU_GENERIC from kernel/rcu/update.c to kernel/rcu/tasks.h in order to improve readability as more APIs are added. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++++ kernel/rcu/update.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4237881e7780..b7e3e1d28a5c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,6 +5,7 @@ * Copyright (C) 2020 Paul E. McKenney */ +#ifdef CONFIG_TASKS_RCU_GENERIC //////////////////////////////////////////////////////////////////////// // @@ -984,3 +985,7 @@ core_initcall(rcu_spawn_tasks_trace_kthread); #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 30dce20e1644..c5799349ff31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,11 +584,7 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ -#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" -#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -static inline void rcu_tasks_bootup_oddness(void) {} -#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From e21408ceec2de5be418efa39feb1e2c00f824a72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 11:01:55 -0700 Subject: rcu-tasks: Add RCU tasks to rcutorture writer stall output This commit adds state for each RCU-tasks flavor to the rcutorture writer stall output. The initial state is minimal, but you have to start somewhere. Signed-off-by: Paul E. McKenney [ paulmck: Fixes based on feedback from kbuild test robot. ] --- kernel/rcu/rcu.h | 1 + kernel/rcu/tasks.h | 45 +++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree_stall.h | 2 +- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 72903867833e..e1089fdf8626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b7e3e1d28a5c..e4f89425d2c4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -219,6 +219,16 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s %c%c %s\n", + rtp->kname, + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} + #ifdef CONFIG_TASKS_RCU //////////////////////////////////////////////////////////////////////// @@ -482,7 +492,14 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); -#endif /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -578,7 +595,14 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); -#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -982,10 +1006,27 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[32]; + + sprintf(buf, "N%d", atomic_read(&trc_n_readers_need_end)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c65c9759e038..e1c68a74574f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,7 +649,7 @@ void show_rcu_gp_kthreads(void) if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* sched_show_task(rcu_state.gp_kthread); */ + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From af051ca4e4231fcf5f366e28453ac28208bb36c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 12:13:33 -0700 Subject: rcu-tasks: Make rcutorture writer stall output include GP state This commit adds grace-period state and time to the rcutorture writer stall output. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e4f89425d2c4..c93fb29b460c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -17,7 +17,7 @@ typedef void (*pregp_func_t)(void); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); typedef void (*postscan_func_t)(void); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); -typedef void (*postgp_func_t)(void); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -27,6 +27,9 @@ typedef void (*postgp_func_t)(void); * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -41,6 +44,8 @@ struct rcu_tasks { struct rcu_head **cbs_tail; struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -73,10 +78,56 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; + //////////////////////////////////////////////////////////////////////// // // Generic code. +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) @@ -141,15 +192,18 @@ static int __noreturn rcu_tasks_kthread(void *arg) READ_ONCE(rtp->cbs_head)); if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); schedule_timeout_interruptible(HZ/10); } continue; } // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); rtp->gp_func(rtp); /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); while (list) { next = list->next; local_bh_disable(); @@ -160,6 +214,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) } /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } } @@ -222,8 +278,11 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s %c%c %s\n", + pr_info("%s: %s(%d) since %lu %c%c %s\n", rtp->kname, + tasks_gp_state_getname(rtp), + data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -243,6 +302,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) LIST_HEAD(holdouts); int fract; + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); /* @@ -251,11 +311,13 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * that are not already voluntarily blocked. Mark these tasks * and make a list of them in holdouts. */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); rcu_read_lock(); for_each_process_thread(g, t) rtp->pertask_func(t, &holdouts); rcu_read_unlock(); + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); rtp->postscan_func(); /* @@ -277,6 +339,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) break; /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); schedule_timeout_interruptible(HZ/fract); if (fract > 1) @@ -288,10 +351,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) lastreport = jiffies; firstreport = true; WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); } - rtp->postgp_func(); + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); } //////////////////////////////////////////////////////////////////////// @@ -394,7 +459,7 @@ static void check_all_holdout_tasks(struct list_head *hop, } /* Finish off the Tasks-RCU grace period. */ -static void rcu_tasks_postgp(void) +static void rcu_tasks_postgp(struct rcu_tasks *rtp) { /* * Because ->on_rq and ->nvcsw are not guaranteed to have a full @@ -881,7 +946,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, } /* Wait for grace period to complete and provide ordering. */ -static void rcu_tasks_trace_postgp(void) +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { bool firstreport; struct task_struct *g, *t; @@ -894,6 +959,7 @@ static void rcu_tasks_trace_postgp(void) smp_mb__after_atomic(); // Order vs. later atomics // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); for (;;) { ret = wait_event_idle_exclusive_timeout( trc_wait, @@ -901,6 +967,7 @@ static void rcu_tasks_trace_postgp(void) READ_ONCE(rcu_task_stall_timeout)); if (ret) break; // Count reached zero. + // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) if (READ_ONCE(t->trc_reader_need_end)) trc_add_holdout(t, &holdouts); -- cgit v1.2.3 From 43766c3eadcf6033c92eb953f88801aebac0f785 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 20:38:29 -0700 Subject: rcu-tasks: Make RCU Tasks Trace make use of RCU scheduler hooks This commit makes the calls to rcu_tasks_qs() detect and report quiescent states for RCU tasks trace. If the task is in a quiescent state and if ->trc_reader_checked is not yet set, the task sets its own ->trc_reader_checked. This will cause the grace-period kthread to remove it from the holdout list if it still remains there. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++-- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c93fb29b460c..6f8a4040fbdd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,7 +180,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_unlock_lock(); // Order updates vs. GP. + smp_mb__after_spinlock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -874,7 +874,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { WRITE_ONCE(t->trc_reader_need_end, false); - t->trc_reader_checked = false; + WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -983,6 +983,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); } smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. } /* Report any needed quiescent state for this exiting task. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4f34c325dd90..37e02812d18f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,8 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -841,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } -- cgit v1.2.3 From 88092d0c99d7584d50cc8caadb8fa9ff8a1d4ea0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 08:57:02 -0700 Subject: rcu-tasks: Add a grace-period start time for throttling and debug This commit adds a place to record the grace-period start in jiffies. This will be used by later commits for debugging purposes and to throttle IPIs early in the grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6f8a4040fbdd..71462cf3d4bd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,6 +46,7 @@ struct rcu_tasks { raw_spinlock_t cbs_lock; int gp_state; unsigned long gp_jiffies; + unsigned long gp_start; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -200,6 +201,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) // Wait for one grace period. set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; rtp->gp_func(rtp); /* Invoke the callbacks. */ -- cgit v1.2.3 From b0afa0f056676ffe0a7213818f09d2460adbcc16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 11:39:26 -0700 Subject: rcu-tasks: Provide boot parameter to delay IPIs until late in grace period This commit provides a rcupdate.rcu_task_ipi_delay kernel boot parameter that specifies how old the RCU tasks trace grace period must be before the grace-period kthread starts sending IPIs. This delay allows more tasks to pass through rcu_tasks_qs() quiescent states, thus reducing (or even eliminating) the number of IPIs that must be sent. On a short rcutorture test setting this kernel boot parameter to HZ/2 resulted in zero IPIs for all 877 RCU-tasks trace grace periods that elapsed during that test. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 71462cf3d4bd..eeac4a122234 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -74,6 +74,11 @@ static struct rcu_tasks rt_name = \ /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; @@ -713,6 +718,10 @@ DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { @@ -998,10 +1007,6 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) rcu_read_unlock_trace_special(t); } -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, - "RCU Tasks Trace"); - /** * call_rcu_tasks_trace() - Queue a callback trace task-based grace period * @rhp: structure to be used for queueing the RCU updates. -- cgit v1.2.3 From 276c410448dbca357a2bc3539acfe04862e5f172 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 16:02:06 -0700 Subject: rcu-tasks: Split ->trc_reader_need_end This commit splits ->trc_reader_need_end by using the rcu_special union. This change permits readers to check to see if a memory barrier is required without any added overhead in the common case where no such barrier is required. This commit also adds the read-side checking. Later commits will add the machinery to properly set the new ->trc_reader_special.b.need_mb field. This commit also makes rcu_read_unlock_trace_special() tolerate nested read-side critical sections within interrupt and NMI handlers. Signed-off-by: Paul E. McKenney --- kernel/fork.c | 1 + kernel/rcu/tasks.h | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 72e9396235b4..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1685,6 +1685,7 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eeac4a122234..17b1b9a31071 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -723,10 +723,17 @@ DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t) +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - WRITE_ONCE(t->trc_reader_need_end, false); - if (atomic_dec_and_test(&trc_n_readers_need_end)) + int nq = t->trc_reader_special.b.need_qs; + + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) wake_up(&trc_wait); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -777,8 +784,8 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -804,8 +811,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // exit from that critical section. if (unlikely(t->trc_reader_nesting)) { atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); } return true; } @@ -884,7 +891,7 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { - WRITE_ONCE(t->trc_reader_need_end, false); + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); @@ -916,7 +923,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], t->trc_reader_nesting, - " N"[!!t->trc_reader_need_end], + " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); } @@ -980,11 +987,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) break; // Count reached zero. // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_need_end)) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_need_end)) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); } @@ -1003,8 +1010,8 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) - rcu_read_unlock_trace_special(t); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); } /** -- cgit v1.2.3 From 238dbce39ea467577ce7e41ee3e98748c436ed0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 10:54:05 -0700 Subject: rcu-tasks: Add grace-period and IPI counts to statistics This commit adds a grace-period count and a count of IPIs sent since boot, which is printed in response to rcutorture writer stalls and at the end of rcutorture testing. These counts will be used to evaluate various schemes to reduce the number of IPIs sent. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 17b1b9a31071..4857450c0430 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -30,6 +30,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @gp_state: Grace period's most recent state transition (debugging). * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -47,6 +49,8 @@ struct rcu_tasks { int gp_state; unsigned long gp_jiffies; unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -208,6 +212,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_GP); rtp->gp_start = jiffies; rtp->gp_func(rtp); + rtp->n_gps++; /* Invoke the callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); @@ -285,11 +290,12 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s: %s(%d) since %lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -592,6 +598,7 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } @@ -856,6 +863,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, atomic_inc(&trc_n_readers_need_end); per_cpu(trc_ipi_to_cpu, cpu) = true; t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) { // Just in case there is some other reason for -- cgit v1.2.3 From 9ae58d7bd11f1fc4c96389df11751f8593d8bd33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 17:16:37 -0700 Subject: rcu-tasks: Add Kconfig option to mediate smp_mb() vs. IPI This commit provides a new TASKS_TRACE_RCU_READ_MB Kconfig option that enables use of read-side memory barriers by both rcu_read_lock_trace() and rcu_read_unlock_trace() when the are executed with the current->trc_reader_special.b.need_mb flag set. This flag is currently never set. Doing that is the subject of a later commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 18 ++++++++++++++++++ kernel/rcu/tasks.h | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cb1d18ef343c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -234,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4857450c0430..4147857007d7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -734,7 +734,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { int nq = t->trc_reader_special.b.need_qs; - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. if (nq) -- cgit v1.2.3 From 7d0c9c50c5a109acd7a5cf589fc5563f9ef7149a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Mar 2020 15:33:12 -0700 Subject: rcu-tasks: Avoid IPIing userspace/idle tasks if kernel is so built Systems running CPU-bound real-time task do not want IPIs sent to CPUs executing nohz_full userspace tasks. Battery-powered systems don't want IPIs sent to idle CPUs in low-power mode. Unfortunately, RCU tasks trace can and will send such IPIs in some cases. Both of these situations occur only when the target CPU is in RCU dyntick-idle mode, in other words, when RCU is not watching the target CPU. This suggests that CPUs in dyntick-idle mode should use memory barriers in outermost invocations of rcu_read_lock_trace() and rcu_read_unlock_trace(), which would allow the RCU tasks trace grace period to directly read out the target CPU's read-side state. One challenge is that RCU tasks trace is not targeting a specific CPU, but rather a task. And that task could switch from one CPU to another at any time. This commit therefore uses try_invoke_on_locked_down_task() and checks for task_curr() in trc_inspect_reader_notrunning(). When this condition holds, the target task is running and cannot move. If CONFIG_TASKS_TRACE_RCU_READ_MB=y, the new rcu_dynticks_zero_in_eqs() function can be used to check if the specified integer (in this case, t->trc_reader_nesting) is zero while the target CPU remains in that same dyntick-idle sojourn. If so, the target task is in a quiescent state. If not, trc_read_check_handler() must indicate failure so that the grace-period kthread can take appropriate action or retry after an appropriate delay, as the case may be. With this change, given CONFIG_TASKS_TRACE_RCU_READ_MB=y, if a given CPU remains idle or a given task continues executing in nohz_full mode, the RCU tasks trace grace-period kthread will detect this without the need to send an IPI. Suggested-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/tasks.h | 36 ++++++++++++++++++++++++++---------- kernel/rcu/tree.c | 24 ++++++++++++++++++++++++ kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 18 ++++++++++++++++++ 5 files changed, 72 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e1089fdf8626..296f9262d119 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -501,6 +501,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -510,6 +511,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4147857007d7..a9e8ecb10860 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -806,22 +806,38 @@ reset_ipi: /* Callback function for scheduler to check locked-down task. */ static bool trc_inspect_reader(struct task_struct *t, void *arg) { - if (task_curr(t)) - return false; // It is running, so decline to inspect it. + int cpu = task_cpu(t); + bool in_qs = false; + + if (task_curr(t)) { + // If no chance of heavyweight readers, do it the hard way. + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state despite its currently running. + // However, we cannot safely change its state. + if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } // Mark as checked. Because this is called from the grace-period // kthread, also remove the task from the holdout list. t->trc_reader_checked = true; trc_del_holdout(t); - // If the task is in a read-side critical section, set up its - // its state so that it will awaken the grace-period kthread upon - // exit from that critical section. - if (unlikely(t->trc_reader_nesting)) { - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); - } + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bbcbf398169..573fd78a7bca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -252,6 +252,7 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -278,6 +279,7 @@ static void rcu_dynticks_eqs_exit(void) */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { @@ -349,6 +351,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) return snap != rcu_dynticks_snap(rdp); } +/* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! + return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + /* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..29ba79989802 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -454,6 +454,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 37e02812d18f..4cef7e3bca69 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2552,3 +2552,21 @@ static void rcu_dynticks_task_exit(void) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} -- cgit v1.2.3 From b38f57c1fe64276773b124dffb0a139cc32ab3cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Mar 2020 14:29:08 -0700 Subject: rcu-tasks: Allow rcu_read_unlock_trace() under scheduler locks The rcu_read_unlock_trace() can invoke rcu_read_unlock_trace_special(), which in turn can call wake_up(). Therefore, if any scheduler lock is held across a call to rcu_read_unlock_trace(), self-deadlock can occur. This commit therefore uses the irq_work facility to defer the wake_up() to a clean environment where no scheduler locks will be held. Reported-by: Steven Rostedt [ paulmck: Update #includes for m68k per kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 12 +++++++++++- kernel/rcu/update.c | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a9e8ecb10860..dd311e93ed0f 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -729,6 +729,16 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { @@ -742,7 +752,7 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_nesting, nesting); if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) - wake_up(&trc_wait); + irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..b1f07a0e3a56 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS -- cgit v1.2.3 From 81b4a7bc3b54b0b839dbf3d2b8c9a353ae910688 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 10:10:07 -0700 Subject: rcu-tasks: Disable CPU hotplug across RCU tasks trace scans This commit disables CPU hotplug across RCU tasks trace scans, which is a first step towards correctly recognizing idle tasks "running" on offline CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index dd311e93ed0f..361e17d57191 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -910,16 +910,16 @@ static void rcu_tasks_trace_pregp_step(void) { int cpu; - // Wait for CPU-hotplug paths to complete. - cpus_read_lock(); - cpus_read_unlock(); - // Allow for fast-acting IPIs. atomic_set(&trc_n_readers_need_end, 1); // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); } /* Do first-round processing for the specified task. */ @@ -935,6 +935,9 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, /* Do intermediate processing between task and holdout scans. */ static void rcu_tasks_trace_postscan(void) { + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); @@ -979,6 +982,9 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, { struct task_struct *g, *t; + // Disable CPU hotplug across the holdout list scan. + cpus_read_lock(); + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { // If safe and needed, try to check the current task. if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && @@ -991,6 +997,10 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, else if (needreport) show_stalled_task_trace(t, firstreport); } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + if (needreport) { if (firstreport) pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); -- cgit v1.2.3 From 7e3b70e0703b48e120c3f5e65498790341120fad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 11:24:58 -0700 Subject: rcu-tasks: Handle the running-offline idle-task special case The idle task corresponding to an offline CPU can appear to be running while that CPU is offline. This commit therefore adds checks for this situation, treating it as a quiescent state. Because the tasklist scan and the holdout-list scan now exclude CPU-hotplug operations, readers on the CPU-hotplug paths are still waited for. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 361e17d57191..e3a42d8f9eeb 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -818,16 +818,20 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) { int cpu = task_cpu(t); bool in_qs = false; + bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { + WARN_ON_ONCE(ofl & !is_idle_task(t)); + // If no chance of heavyweight readers, do it the hard way. - if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) return false; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. in_qs = true; } else { -- cgit v1.2.3 From 9796e1ae7386ecf66eb234f7db7753845ebb2139 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 13:18:54 -0700 Subject: rcu-tasks: Make RCU tasks trace also wait for idle tasks This commit scans the CPUs, adding each CPU's idle task to the list of tasks that need quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e3a42d8f9eeb..f272e8f16b81 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -15,7 +15,7 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); typedef void (*pregp_func_t)(void); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); -typedef void (*postscan_func_t)(void); +typedef void (*postscan_func_t)(struct list_head *hop); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); @@ -331,7 +331,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rcu_read_unlock(); set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); - rtp->postscan_func(); + rtp->postscan_func(&holdouts); /* * Each pass through the following loop scans the list of holdout @@ -415,7 +415,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(void) +void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. This @@ -936,9 +936,17 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, trc_wait_for_one_reader(t, hop); } -/* Do intermediate processing between task and holdout scans. */ -static void rcu_tasks_trace_postscan(void) +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) { + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + // Re-enable CPU hotplug now that the tasklist scan has completed. cpus_read_unlock(); -- cgit v1.2.3 From 40471509be3cb8c9c02aec1c316614cb96e6fe85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 13:34:34 -0700 Subject: rcu-tasks: Add rcu_dynticks_zero_in_eqs() effectiveness statistics This commit adds counts of the number of calls and number of successful calls to rcu_dynticks_zero_in_eqs(), which are printed at the end of rcutorture runs and at stall time. This allows evaluation of the effectiveness of rcu_dynticks_zero_in_eqs(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f272e8f16b81..ce658831c759 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -725,6 +725,11 @@ DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; + void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); @@ -830,9 +835,11 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. + n_heavy_reader_attempts++; if (!ofl && // Check for "running" idle tasks on offline CPUs. !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; in_qs = true; } else { in_qs = likely(!t->trc_reader_nesting); @@ -1147,9 +1154,11 @@ core_initcall(rcu_spawn_tasks_trace_kthread); static void show_rcu_tasks_trace_gp_kthread(void) { - char buf[32]; + char buf[64]; - sprintf(buf, "N%d", atomic_read(&trc_n_readers_need_end)); + sprintf(buf, "N%d h:%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } -- cgit v1.2.3 From edf3775f0ad66879796f594983163f672c4bf1a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Mar 2020 14:09:45 -0700 Subject: rcu-tasks: Add count for idle tasks on offline CPUs This commit adds a counter for the number of times the quiescent state was an idle task associated with an offline CPU, and prints this count at the end of rcutorture runs and at stall time. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce658831c759..0d1b5bf8317d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -729,6 +729,7 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); // heavyweight readers executing explicit memory barriers. unsigned long n_heavy_reader_attempts; unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -840,6 +841,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return false; // No quiescent state, do it the hard way. n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; in_qs = true; } else { in_qs = likely(!t->trc_reader_nesting); @@ -1156,7 +1159,8 @@ static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%d h:%lu/%lu", atomic_read(&trc_n_readers_need_end), + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); -- cgit v1.2.3 From 7e0669c3e9dec367ecb63062898c70c1c596b749 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Mar 2020 14:36:05 -0700 Subject: rcu-tasks: Add IPI failure count to statistics This commit adds a failure-return count for smp_call_function_single(), and adds this to the console messages for rcutorture writer stalls and at the end of rcutorture testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 0d1b5bf8317d..0a580eff7c34 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @gp_start: Most recent grace-period start in jiffies. * @n_gps: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -51,6 +52,7 @@ struct rcu_tasks { unsigned long gp_start; unsigned long n_gps; unsigned long n_ipis; + unsigned long n_ipis_fails; struct task_struct *kthread_ptr; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; @@ -290,12 +292,12 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - pr_info("%s: %s(%d) since %lu g:%lu i:%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, - tasks_gp_state_getname(rtp), - data_race(rtp->gp_state), + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), - data_race(rtp->n_gps), data_race(rtp->n_ipis), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtp->cbs_head)], s); @@ -909,6 +911,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, trc_read_check_handler, t, 0)) { // Just in case there is some other reason for // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; per_cpu(trc_ipi_to_cpu, cpu) = false; t->trc_ipi_to_cpu = cpu; if (atomic_dec_and_test(&trc_n_readers_need_end)) { -- cgit v1.2.3 From 25246fc83155b254534ce579fb713828fb5e621a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 5 Apr 2020 20:49:13 -0700 Subject: rcu-tasks: Allow standalone use of TASKS_{TRACE_,}RCU This commit allows TASKS_TRACE_RCU to be used independently of TASKS_RCU and vice versa. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 0a580eff7c34..ce23f6cc5043 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -244,27 +244,6 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) smp_mb(); /* Ensure others see full kthread. */ } -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - struct task_struct *t = current; - - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); - preempt_enable(); - exit_tasks_rcu_finish_trace(t); -} - #ifndef CONFIG_TINY_RCU /* @@ -303,7 +282,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) s); } -#ifdef CONFIG_TASKS_RCU +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) //////////////////////////////////////////////////////////////////////// // @@ -374,6 +355,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtp->postgp_func(rtp); } +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + //////////////////////////////////////////////////////////////////////// // // Simple variant of RCU whose quiescent states are voluntary context @@ -577,8 +562,29 @@ static void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + #else /* #ifdef CONFIG_TASKS_RCU */ static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1075,7 +1081,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) } /* Report any needed quiescent state for this exiting task. */ -void exit_tasks_rcu_finish_trace(struct task_struct *t) +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); @@ -1170,7 +1176,7 @@ static void show_rcu_tasks_trace_gp_kthread(void) } #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3 From e5a971d76d701dbff9e5dbaa84dc9e8c3081a867 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Apr 2020 12:10:28 -0700 Subject: ftrace: Use synchronize_rcu_tasks_rude() instead of ftrace_sync() This commit replaces the schedule_on_each_cpu(ftrace_sync) instances with synchronize_rcu_tasks_rude(). Suggested-by: Steven Rostedt Cc: Ingo Molnar [ paulmck: Make Kconfig adjustments noted by kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/trace/Kconfig | 1 + kernel/trace/ftrace.c | 17 +++-------------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 402eef84c859..ae69010d521a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -158,6 +158,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 041694a1eb74..771eace959f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -256,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2932,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -5887,7 +5876,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } -- cgit v1.2.3 From 654db05cee8186cf9438d94ef32a4f9ffe964e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Feb 2020 02:35:22 -0800 Subject: rcu: Use data_race() for RCU expedited CPU stall-warning prints Although the accesses used to determine whether or not an expedited stall should be printed are an integral part of the concurrency algorithm governing use of the corresponding variables, the values that are simply printed are ancillary. As such, it is best to use data_race() for these accesses in order to provide the greatest latitude in the use of KCSAN for the other accesses that are an integral part of the algorithm. This commit therefore changes the relevant uses of READ_ONCE() to data_race(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..e1a7986f15b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } -- cgit v1.2.3 From 88375825171c7de5f1e68ac6fd5d35d3b831da3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Mar 2020 19:00:52 -0700 Subject: rcu: When GP kthread is starved, tag idle threads as false positives If the grace-period kthread is starved, idle threads' extended quiescent states are not reported. These idle threads thus wrongly appear to be blocking the current grace period. This commit therefore tags such idle threads as probable false positives when the grace-period kthread is being starved. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4da41a613ece 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -299,6 +299,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -313,6 +323,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +344,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +358,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,8 +369,7 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), @@ -364,6 +377,7 @@ static void rcu_check_gp_kthread_starvation(void) gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); -- cgit v1.2.3 From 33b2b93bd831fc0e994654cef3d046c713e3b55e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Apr 2020 14:12:07 -0700 Subject: rcu: Remove self-stack-trace when all quiescent states seen When all quiescent states have been seen, it is normally the grace-period kthread that is in trouble. Although the existing stack trace from the current CPU might possibly provide useful information, experience indicates that there is too much noise for this to be worthwhile. This commit therefore removes this stack trace from the output. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4da41a613ece..535762b07543 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -440,8 +440,6 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_state.name, j - gpa, j, gpa, READ_ONCE(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ -- cgit v1.2.3 From 3b2a47398552938d2ae0091f35eb3658a52a0769 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2020 16:30:35 -0700 Subject: rcutorture: Add KCSAN stubs This commit adds stubs for KCSAN's data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros to allow code using these macros to move ahead. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..7e2ea0c57433 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -- cgit v1.2.3 From c9527bebb017b891d1a2bbb96217bd5225488a0e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Feb 2020 13:41:02 -0800 Subject: rcutorture: Mark data-race potential for rcu_barrier() test statistics The n_barrier_successes, n_barrier_attempts, and n_rcu_torture_barrier_error variables are updated (without access markings) by the main rcu_barrier() test kthread, and accessed (also without access markings) by the rcu_torture_stats() kthread. This of course can result in KCSAN complaints. Because the accesses are in diagnostic prints, this commit uses data_race() to excuse the diagnostic prints from the data race. If this were to ever cause bogus statistics prints (for example, due to store tearing), any misleading information would be disambiguated by the presence or absence of an rcutorture splat. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to the mild consequences of the failure, namely a confusing rcutorture console message. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e2ea0c57433..d0345d14e22a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1456,9 +1456,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || -- cgit v1.2.3 From 52785b6ae8eded7ac99d65c92d989b702e5b4376 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Apr 2020 02:58:37 +0000 Subject: kcsan: Use GFP_ATOMIC under spin lock A spin lock is held in insert_report_filterlist(), so the krealloc() should use GFP_ATOMIC. This commit therefore makes this change. Reviewed-by: Marco Elver Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 1a08664a7fab..023e49c58d55 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -230,7 +230,7 @@ static ssize_t insert_report_filterlist(const char *func) /* initial allocation */ report_filterlist.addrs = kmalloc_array(report_filterlist.size, - sizeof(unsigned long), GFP_KERNEL); + sizeof(unsigned long), GFP_ATOMIC); if (report_filterlist.addrs == NULL) { ret = -ENOMEM; goto out; @@ -240,7 +240,7 @@ static ssize_t insert_report_filterlist(const char *func) size_t new_size = report_filterlist.size * 2; unsigned long *new_addrs = krealloc(report_filterlist.addrs, - new_size * sizeof(unsigned long), GFP_KERNEL); + new_size * sizeof(unsigned long), GFP_ATOMIC); if (new_addrs == NULL) { /* leave filterlist itself untouched */ -- cgit v1.2.3 From 8c1b2bf16d5944cd5c3a8a72e24ed9e22360c1af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:34 +0200 Subject: bpf, cgroup: Remove unused exports Except for a few of the networking hooks called from modular ipv4 or ipv6 code, all of hooks are just called from guaranteed to be built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20200424064338.538313-2-hch@lst.de --- kernel/bpf/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4d748c5785bc..fc7c7002fd37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1054,7 +1054,6 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return !allow; } -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1207,7 +1206,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, return ret == 1 ? 0 : -EPERM; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, @@ -1312,7 +1310,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -1399,7 +1396,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, -- cgit v1.2.3 From 182e073f68a080a29920f6dca796ccf4806b0329 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 00:40:04 +0800 Subject: cpu/hotplug: Fix a typo in comment "broadacasted"->"broadcasted" Signed-off-by: Ethon Paul Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200417164008.6541-1-ethp@qq.com --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..6a02d4434cf7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -432,7 +432,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); -- cgit v1.2.3 From 9d2161bed4e39ef7a5e5f18f69c4a57d001051b9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:37:04 -0400 Subject: audit: log audit netlink multicast bind and unbind Log information about programs connecting to and disconnecting from the audit netlink multicast socket. This is needed so that during investigations a security officer can tell who or what had access to the audit trail. This helps to meet the FAU_SAR.2 requirement for Common Criteria. Here is the systemd startup event: type=PROCTITLE msg=audit(2020-04-22 10:10:21.787:10) : proctitle=/init type=SYSCALL msg=audit(2020-04-22 10:10:21.787:10) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x19 a1=0x555f4aac7e90 a2=0xc a3=0x7ffcb792ff44 items=0 ppid=0 pid=1 auid=unset uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=(none) ses=unset comm=systemd exe=/usr/lib/systemd/systemd subj=kernel key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 10:10:21.787:10) : pid=1 uid=root auid=unset tty=(none) ses=unset subj=kernel comm=systemd exe=/usr/lib/systemd/systemd nl-mcgrp=1 op=connect res=yes And events from the test suite that just uses close(): type=PROCTITLE msg=audit(2020-04-22 11:47:08.501:442) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:47:08.501:442) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x563004378760 a2=0xc a3=0x0 items=0 ppid=815 pid=818 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:442) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:443) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes And the events from the test suite using setsockopt with NETLINK_DROP_MEMBERSHIP: type=PROCTITLE msg=audit(2020-04-22 11:39:53.291:439) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.291:439) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x5560877c2d20 a2=0xc a3=0x0 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.291:439) : pid=775 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=PROCTITLE msg=audit(2020-04-22 11:39:53.292:440) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.292:440) : arch=x86_64 syscall=setsockopt success=yes exit=0 a0=0x7 a1=SOL_NETLINK a2=0x2 a3=0x7ffc8366f000 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.292:440) : pid=775 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes Please see the upstream issue tracker at https://github.com/linux-audit/audit-kernel/issues/28 With the feature description at https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Multicast-Socket-Join-Part The testsuite support is at https://github.com/rgbriggs/audit-testsuite/compare/ghak28-mcast-part-join https://github.com/linux-audit/audit-testsuite/pull/93 And the userspace support patch is at https://github.com/linux-audit/audit-userspace/pull/114 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 622c30246d19..e33460e01b3b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1530,20 +1530,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; -- cgit v1.2.3 From 3740d93e37902b31159a82da2d5c8812ed825404 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 16 Apr 2020 16:28:59 +0000 Subject: coredump: fix crash when umh is disabled Commit 64e90a8acb859 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") added the optiont to disable all call_usermodehelper() calls by setting STATIC_USERMODEHELPER_PATH to an empty string. When this is done, and crashdump is triggered, it will crash on null pointer dereference, since we make assumptions over what call_usermodehelper_exec() did. This has been reported by Sergey when one triggers a a coredump with the following configuration: ``` CONFIG_STATIC_USERMODEHELPER=y CONFIG_STATIC_USERMODEHELPER_PATH="" kernel.core_pattern = |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h %e ``` The way disabling the umh was designed was that call_usermodehelper_exec() would just return early, without an error. But coredump assumes certain variables are set up for us when this happens, and calls ile_start_write(cprm.file) with a NULL file. [ 2.819676] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 2.819859] #PF: supervisor read access in kernel mode [ 2.820035] #PF: error_code(0x0000) - not-present page [ 2.820188] PGD 0 P4D 0 [ 2.820305] Oops: 0000 [#1] SMP PTI [ 2.820436] CPU: 2 PID: 89 Comm: a Not tainted 5.7.0-rc1+ #7 [ 2.820680] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 [ 2.821150] RIP: 0010:do_coredump+0xd80/0x1060 [ 2.821385] Code: e8 95 11 ed ff 48 c7 c6 cc a7 b4 81 48 8d bd 28 ff ff ff 89 c2 e8 70 f1 ff ff 41 89 c2 85 c0 0f 84 72 f7 ff ff e9 b4 fe ff ff <48> 8b 57 20 0f b7 02 66 25 00 f0 66 3d 00 8 0 0f 84 9c 01 00 00 44 [ 2.822014] RSP: 0000:ffffc9000029bcb8 EFLAGS: 00010246 [ 2.822339] RAX: 0000000000000000 RBX: ffff88803f860000 RCX: 000000000000000a [ 2.822746] RDX: 0000000000000009 RSI: 0000000000000282 RDI: 0000000000000000 [ 2.823141] RBP: ffffc9000029bde8 R08: 0000000000000000 R09: ffffc9000029bc00 [ 2.823508] R10: 0000000000000001 R11: ffff88803dec90be R12: ffffffff81c39da0 [ 2.823902] R13: ffff88803de84400 R14: 0000000000000000 R15: 0000000000000000 [ 2.824285] FS: 00007fee08183540(0000) GS:ffff88803e480000(0000) knlGS:0000000000000000 [ 2.824767] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.825111] CR2: 0000000000000020 CR3: 000000003f856005 CR4: 0000000000060ea0 [ 2.825479] Call Trace: [ 2.825790] get_signal+0x11e/0x720 [ 2.826087] do_signal+0x1d/0x670 [ 2.826361] ? force_sig_info_to_task+0xc1/0xf0 [ 2.826691] ? force_sig_fault+0x3c/0x40 [ 2.826996] ? do_trap+0xc9/0x100 [ 2.827179] exit_to_usermode_loop+0x49/0x90 [ 2.827359] prepare_exit_to_usermode+0x77/0xb0 [ 2.827559] ? invalid_op+0xa/0x30 [ 2.827747] ret_from_intr+0x20/0x20 [ 2.827921] RIP: 0033:0x55e2c76d2129 [ 2.828107] Code: 2d ff ff ff e8 68 ff ff ff 5d c6 05 18 2f 00 00 01 c3 0f 1f 80 00 00 00 00 c3 0f 1f 80 00 00 00 00 e9 7b ff ff ff 55 48 89 e5 <0f> 0b b8 00 00 00 00 5d c3 66 2e 0f 1f 84 0 0 00 00 00 00 0f 1f 40 [ 2.828603] RSP: 002b:00007fffeba5e080 EFLAGS: 00010246 [ 2.828801] RAX: 000055e2c76d2125 RBX: 0000000000000000 RCX: 00007fee0817c718 [ 2.829034] RDX: 00007fffeba5e188 RSI: 00007fffeba5e178 RDI: 0000000000000001 [ 2.829257] RBP: 00007fffeba5e080 R08: 0000000000000000 R09: 00007fee08193c00 [ 2.829482] R10: 0000000000000009 R11: 0000000000000000 R12: 000055e2c76d2040 [ 2.829727] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 2.829964] CR2: 0000000000000020 [ 2.830149] ---[ end trace ceed83d8c68a1bf1 ]--- ``` Cc: # v4.11+ Fixes: 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199795 Reported-by: Tony Vroon Reported-by: Sergey Kvachonok Tested-by: Sergei Trofimovich Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20200416162859.26518-1-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/umh.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..11bf5eea474c 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,6 +544,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). + * + * Note: successful return value does not guarantee the helper was called at + * all. You can't rely on sub_info->{init,cleanup} being called even for + * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers + * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { -- cgit v1.2.3 From 6b03d1304a32dc8450c7516000a0fe07bef7c446 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 19 Apr 2020 06:35:02 -0500 Subject: proc: Ensure we see the exit of each process tid exactly once When the thread group leader changes during exec and the old leaders thread is reaped proc_flush_pid will flush the dentries for the entire process because the leader still has it's original pid. Fix this by exchanging the pids in an rcu safe manner, and wrapping the code to do that up in a helper exchange_tids. When I removed switch_exec_pids and introduced this behavior in d73d65293e3e ("[PATCH] pidhash: kill switch_exec_pids") there really was nothing that cared as flushing happened with the cached dentry and de_thread flushed both of them on exec. This lack of fully exchanging pids became a problem a few months later when I introduced 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). Which overlooked the de_thread case was no longer swapping pids, and I was looking up proc dentries by task->pid. The current behavior isn't properly a bug as everything in proc will continue to work correctly just a little bit less efficiently. Fix this just so there are no little surprise corner cases waiting to bite people. -- Oleg points out this could be an issue in next_tgid in proc where has_group_leader_pid is called, and reording some of the assignments should fix that. -- Oleg points out this will break the 10 year old hack in __exit_signal.c > /* > * This can only happen if the caller is de_thread(). > * FIXME: this is the temporary hack, we should teach > * posix-cpu-timers to handle this case correctly. > */ > if (unlikely(has_group_leader_pid(tsk))) > posix_cpu_timers_exit_group(tsk); The code in next_tgid has been changed to use PIDTYPE_TGID, and the posix cpu timers code has been fixed so it does not need the 10 year old hack, so this should be safe to merge now. Link: https://lore.kernel.org/lkml/87h7x3ajll.fsf_-_@x220.int.ebiederm.org/ Acked-by: Linus Torvalds Acked-by: Oleg Nesterov Fixes: 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). Signed-off-by: Eric W. Biederman --- kernel/pid.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index c835b844aca7..6d5d0a5bda82 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -363,6 +363,25 @@ void change_pid(struct task_struct *task, enum pid_type type, attach_pid(task, type); } +void exchange_tids(struct task_struct *left, struct task_struct *right) +{ + struct pid *pid1 = left->thread_pid; + struct pid *pid2 = right->thread_pid; + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + + /* Swap the single entry tid lists */ + hlists_swap_heads_rcu(head1, head2); + + /* Swap the per task_struct pid */ + rcu_assign_pointer(left->thread_pid, pid2); + rcu_assign_pointer(right->thread_pid, pid1); + + /* Swap the cached value */ + WRITE_ONCE(left->pid, pid_nr(pid2)); + WRITE_ONCE(right->pid, pid_nr(pid1)); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) -- cgit v1.2.3 From 8feebc6713cd78cbba8c4e2a6d92299669052026 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Apr 2020 14:32:54 -0500 Subject: posix-cpu-timer: Tidy up group_leader logic in lookup_task Replace has_group_leader_pid with thread_group_leader. Years ago Oleg suggested changing thread_group_leader to has_group_leader_pid to handle races. Looking at the code then and now I don't see how it ever helped. Especially as then the code really did need to be the thread_group_leader. Today it doesn't make a difference if thread_group_leader races with de_thread as the task returned from lookup_task in the non-thread case is just used to find values in task->signal. Since the races with de_thread have never been handled revert has_group_header_pid to thread_group_leader for clarity. Update the comment in lookup_task to remove implementation details that are no longer true and to mention task->signal instead of task->sighand, as the relevant cpu timer details are all in task->signal. Ref: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Ref: c0deae8c9587 ("posix-cpu-timers: Rcu_read_lock/unlock protect find_task_by_vpid call") Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2fd3b3fa68bf..e4051e417bcb 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -69,12 +69,8 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, if (gettime) { /* * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. tsk->sighand gives + * the actual group leader. task->signal gives * access to the group's clock. - * - * Timers need the group leader because they take a - * reference on it and store the task pointer until the - * timer is destroyed. */ return (p == current || thread_group_leader(p)) ? p : NULL; } @@ -82,7 +78,7 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, /* * For processes require that p is group leader. */ - return has_group_leader_pid(p) ? p : NULL; + return thread_group_leader(p) ? p : NULL; } static struct task_struct *__get_task_for_clock(const clockid_t clock, -- cgit v1.2.3 From c7f5194054e103d18d267385b866b5b90511a425 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 Apr 2020 13:00:39 -0500 Subject: posix-cpu-timer: Unify the now redundant code in lookup_task Now that both !thread paths through lookup_task call thread_group_leader, unify them into the single test at the end of lookup_task. This unification just makes it clear what is happening in the gettime special case of lookup_task. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e4051e417bcb..b7f972fb115e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -66,14 +66,13 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, if (thread) return same_thread_group(p, current) ? p : NULL; - if (gettime) { - /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. task->signal gives - * access to the group's clock. - */ - return (p == current || thread_group_leader(p)) ? p : NULL; - } + /* + * For clock_gettime(PROCESS) the task does not need to be + * the actual group leader. task->signal gives + * access to the group's clock. + */ + if (gettime && (p == current)) + return p; /* * For processes require that p is group leader. -- cgit v1.2.3 From c4dad0aab3fca0c1f0baa4cc84b6ec91b7ebf426 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:28 -0400 Subject: audit: tidy and extend netfilter_cfg x_tables NETFILTER_CFG record generation was inconsistent for x_tables and ebtables configuration changes. The call was needlessly messy and there were supporting records missing at times while they were produced when not requested. Simplify the logging call into a new audit_log_nfcfg call. Honour the audit_enabled setting while more consistently recording information including supporting records by tidying up dummy checks. Add an op= field that indicates the operation being performed (register or replace). Here is the enhanced sample record: type=NETFILTER_CFG msg=audit(1580905834.919:82970): table=filter family=2 entries=83 op=replace Generate audit NETFILTER_CFG records on ebtables table registration. Previously this was being done for x_tables registration and replacement operations and ebtables table replacement only. See: https://github.com/linux-audit/audit-kernel/issues/25 See: https://github.com/linux-audit/audit-kernel/issues/35 See: https://github.com/linux-audit/audit-kernel/issues/43 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..705beac0ce29 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -130,6 +130,16 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -2542,6 +2552,20 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op) +{ + struct audit_buffer *ab; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + name, af, nentries, audit_nfcfgs[op].s); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; -- cgit v1.2.3 From a45d88530b2552ad5ea0da18861600b4ecc9d0c7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:29 -0400 Subject: netfilter: add audit table unregister actions Audit the action of unregistering ebtables and x_tables. See: https://github.com/linux-audit/audit-kernel/issues/44 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 705beac0ce29..d281c18d1771 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -136,8 +136,9 @@ struct audit_nfcfgop_tab { }; const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_UNREGISTER, "unregister" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) -- cgit v1.2.3 From f9d041271cf44ca02eed0cc82e1a6d8c814c53ed Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:05 -0700 Subject: bpf: Refactor bpf_link update handling Make bpf_link update support more generic by making it into another bpf_link_ops methods. This allows generic syscall handling code to be agnostic to various conditionally compiled features (e.g., the case of CONFIG_CGROUP_BPF). This also allows to keep link type-specific code to remain static within respective code base. Refactor existing bpf_cgroup_link code and take advantage of this. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-2-andriin@fb.com --- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 11 ++++------- kernel/cgroup/cgroup.c | 27 --------------------------- 3 files changed, 32 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf634959885c..da6e48e802b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -557,8 +557,9 @@ found: * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -811,6 +836,7 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .update_prog = cgroup_bpf_replace, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..f5358e1462eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3645,13 +3645,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = EINVAL; out_put_progs: if (old_prog) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..557a9b9d2244 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { -- cgit v1.2.3 From a3b80e1078943dc12553166fb08e258463dec013 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:06 -0700 Subject: bpf: Allocate ID for bpf_link Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog. bpf_link creation, initialization, attachment, and exposing to user-space through FD and ID is a complicated multi-step process, abstract it away through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and bpf_link_cleanup() internal API. They guarantee that until bpf_link is properly attached, user-space won't be able to access partially-initialized bpf_link either from FD or ID. All this allows to simplify bpf_link attachment and error handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-3-andriin@fb.com --- kernel/bpf/cgroup.c | 14 +++-- kernel/bpf/syscall.c | 143 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 104 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da6e48e802b2..1bdf37fca879 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = { int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5358e1462eb..5439e05e3d25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -2181,25 +2183,38 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2210,6 +2225,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2275,9 +2291,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", link_type, + link->id, prog_tag, prog->aux->id); } @@ -2292,36 +2310,76 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + fput(file); + return id; + } + + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2367,9 +2425,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2404,22 +2462,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2447,7 +2502,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, }; @@ -2456,13 +2511,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = { static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2515,24 +2570,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -3464,7 +3517,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; -- cgit v1.2.3 From 2d602c8cf40d65d4a7ac34fe18648d8778e6e594 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:07 -0700 Subject: bpf: Support GET_FD_BY_ID and GET_NEXT_ID for bpf_link Add support to look up bpf_link by ID and iterate over all existing bpf_links in the system. GET_FD_BY_ID code handles not-yet-ready bpf_link by checking that its ID hasn't been set to non-zero value yet. Setting bpf_link's ID is done as the very last step in finalizing bpf_link, together with installing FD. This approach allows users of bpf_link in kernel code to not worry about races between user-space and kernel code that hasn't finished attaching and initializing bpf_link. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-4-andriin@fb.com --- kernel/bpf/syscall.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5439e05e3d25..1c213a730502 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3713,6 +3713,48 @@ out_put_link: return ret; } +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3830,6 +3872,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++++- kernel/bpf/syscall.c | 155 +++++++++++++++++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 2 + 4 files changed, 181 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program -- cgit v1.2.3 From 9bf7c32409354b4a2fa3207d369a22c8233f718c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Apr 2020 18:38:54 -0500 Subject: posix-cpu-timers: Extend rcu_read_lock removing task_struct references Now that the code stores of pid references it is no longer necessary or desirable to take a reference on task_struct in __get_task_for_clock. Instead extend the scope of rcu_read_lock and remove the reference counting on struct task_struct entirely. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b7f972fb115e..91996dd089a4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -81,36 +81,36 @@ static struct task_struct *lookup_task(const pid_t pid, bool thread, } static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool getref, bool gettime) + bool gettime) { const bool thread = !!CPUCLOCK_PERTHREAD(clock); const pid_t pid = CPUCLOCK_PID(clock); - struct task_struct *p; if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) return NULL; - rcu_read_lock(); - p = lookup_task(pid, thread, gettime); - if (p && getref) - get_task_struct(p); - rcu_read_unlock(); - return p; + return lookup_task(pid, thread, gettime); } static inline struct task_struct *get_task_for_clock(const clockid_t clock) { - return __get_task_for_clock(clock, true, false); + return __get_task_for_clock(clock, false); } static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) { - return __get_task_for_clock(clock, true, true); + return __get_task_for_clock(clock, true); } static inline int validate_clock_permissions(const clockid_t clock) { - return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; + int ret; + + rcu_read_lock(); + ret = __get_task_for_clock(clock, false) ? 0 : -EINVAL; + rcu_read_unlock(); + + return ret; } static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) @@ -368,15 +368,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) struct task_struct *tsk; u64 t; + rcu_read_lock(); tsk = get_task_for_clock_get(clock); - if (!tsk) + if (!tsk) { + rcu_read_unlock(); return -EINVAL; + } if (CPUCLOCK_PERTHREAD(clock)) t = cpu_clock_sample(clkid, tsk); else t = cpu_clock_sample_group(clkid, tsk, false); - put_task_struct(tsk); + rcu_read_unlock(); *tp = ns_to_timespec64(t); return 0; @@ -389,19 +392,19 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p = get_task_for_clock(new_timer->it_clock); + struct task_struct *p; - if (!p) + rcu_read_lock(); + p = get_task_for_clock(new_timer->it_clock); + if (!p) { + rcu_read_unlock(); return -EINVAL; + } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); - /* - * get_task_for_clock() took a reference on @p. Drop it as the timer - * holds a reference on the pid of @p. - */ - put_task_struct(p); + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From fece98260f31292dd468925c7582065bf6193212 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Apr 2020 09:38:29 -0500 Subject: posix-cpu-timers: Replace cpu_timer_pid_type with clock_pid_type Taking a clock and returning a pid_type is a more general and a superset of taking a timer and returning a pid_type. Perform this generalization so that future changes may use this code on clocks as well as timers. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 91996dd089a4..42f673974d71 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -113,14 +113,14 @@ static inline int validate_clock_permissions(const clockid_t clock) return ret; } -static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +static inline enum pid_type clock_pid_type(const clockid_t clock) { - return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; + return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID; } static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) { - return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); + return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock)); } /* @@ -403,7 +403,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); + new_timer->it.cpu.pid = get_task_pid(p, clock_pid_type(new_timer->it_clock)); rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 964987738b3fe557cb1ee37acb4e7f85e29b7cea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Apr 2020 07:55:07 -0500 Subject: posix-cpu-timers: Replace __get_task_for_clock with pid_for_clock Now that the codes store references to pids instead of referendes to tasks. Looking up a task for a clock instead of looking up a struct pid makes the code more difficult to verify it is correct than necessary. In posix_cpu_timers_create get_task_pid can race with release_task for threads and return a NULL pid. As put_pid and cpu_timer_task_rcu handle NULL pids just fine the code works without problems but it is an extra case to consider and keep in mind while verifying and modifying the code. There are races with de_thread to consider that only don't apply because thread clocks are only allowed for threads in the same thread_group. So instead of leaving a burden for people making modification to the code in the future return a rcu protected struct pid for the clock instead. The logic for __get_task_for_pid and lookup_task has been folded into the new function pid_for_clock with the only change being the logic has been modified from working on a task to working on a pid that will be returned. In posix_cpu_clock_get instead of calling pid_for_clock checking the result and then calling pid_task to get the task. The result of pid_for_clock is fed directly into pid_task. This is safe because pid_task handles NULL pids. As such an extra error check was unnecessary. Instead of hiding the flag that enables the special clock_gettime handling, I have made the 3 callers just pass the flag in themselves. That is less code and seems just as simple to work with as the wrapper functions. Historically the clock_gettime special case of allowing a process clock to be found by the thread id did not even exist [33ab0fec3352] but Thomas Gleixner reports that he has found code that uses that functionality [55e8c8eb2c7b]. Link: https://lkml.kernel.org/r/87zhaxqkwa.fsf@nanos.tec.linutronix.de/ Ref: 33ab0fec3352 ("posix-timers: Consolidate posix_cpu_clock_get()") Ref: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-cpu-timers.c | 75 +++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 42f673974d71..165117996ea0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -47,59 +47,44 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) /* * Functions for validating access to tasks. */ -static struct task_struct *lookup_task(const pid_t pid, bool thread, - bool gettime) +static struct pid *pid_for_clock(const clockid_t clock, bool gettime) { - struct task_struct *p; + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t upid = CPUCLOCK_PID(clock); + struct pid *pid; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; /* * If the encoded PID is 0, then the timer is targeted at current * or the process to which current belongs. */ - if (!pid) - return thread ? current : current->group_leader; + if (upid == 0) + return thread ? task_pid(current) : task_tgid(current); - p = find_task_by_vpid(pid); - if (!p) - return p; + pid = find_vpid(upid); + if (!pid) + return NULL; - if (thread) - return same_thread_group(p, current) ? p : NULL; + if (thread) { + struct task_struct *tsk = pid_task(pid, PIDTYPE_PID); + return (tsk && same_thread_group(tsk, current)) ? pid : NULL; + } /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. task->signal gives - * access to the group's clock. + * For clock_gettime(PROCESS) allow finding the process by + * with the pid of the current task. The code needs the tgid + * of the process so that pid_task(pid, PIDTYPE_TGID) can be + * used to find the process. */ - if (gettime && (p == current)) - return p; + if (gettime && (pid == task_pid(current))) + return task_tgid(current); /* - * For processes require that p is group leader. + * For processes require that pid identifies a process. */ - return thread_group_leader(p) ? p : NULL; -} - -static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool gettime) -{ - const bool thread = !!CPUCLOCK_PERTHREAD(clock); - const pid_t pid = CPUCLOCK_PID(clock); - - if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) - return NULL; - - return lookup_task(pid, thread, gettime); -} - -static inline struct task_struct *get_task_for_clock(const clockid_t clock) -{ - return __get_task_for_clock(clock, false); -} - -static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) -{ - return __get_task_for_clock(clock, true); + return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL; } static inline int validate_clock_permissions(const clockid_t clock) @@ -107,7 +92,7 @@ static inline int validate_clock_permissions(const clockid_t clock) int ret; rcu_read_lock(); - ret = __get_task_for_clock(clock, false) ? 0 : -EINVAL; + ret = pid_for_clock(clock, false) ? 0 : -EINVAL; rcu_read_unlock(); return ret; @@ -369,7 +354,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) u64 t; rcu_read_lock(); - tsk = get_task_for_clock_get(clock); + tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock)); if (!tsk) { rcu_read_unlock(); return -EINVAL; @@ -392,18 +377,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p; + struct pid *pid; rcu_read_lock(); - p = get_task_for_clock(new_timer->it_clock); - if (!p) { + pid = pid_for_clock(new_timer->it_clock, false); + if (!pid) { rcu_read_unlock(); return -EINVAL; } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, clock_pid_type(new_timer->it_clock)); + new_timer->it.cpu.pid = get_pid(pid); rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } -- cgit v1.2.3 From 449e14bfdb83bf772200840a7ac4dcc1d7cacf54 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Apr 2020 15:21:58 +0200 Subject: bpf: Fix unused variable warning Hiding the only using of bpf_link_type_strs[] in an #ifdef causes an unused-variable warning: kernel/bpf/syscall.c:2280:20: error: 'bpf_link_type_strs' defined but not used [-Werror=unused-variable] 2280 | static const char *bpf_link_type_strs[] = { Move the definition into the same #ifdef. Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200429132217.1294289-1-arnd@arndb.de --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d23c04cbe14f..c75b2dd2459c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2271,6 +2271,7 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, @@ -2282,7 +2283,6 @@ static const char *bpf_link_type_strs[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; -- cgit v1.2.3 From 3c2214b6027ff37945799de717c417212e1a8c54 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 21 Apr 2020 12:34:55 -0400 Subject: padata: add separate cpuhp node for CPUHP_PADATA_DEAD Removing the pcrypt module triggers this: general protection fault, probably for non-canonical address 0xdead000000000122 CPU: 5 PID: 264 Comm: modprobe Not tainted 5.6.0+ #2 Hardware name: QEMU Standard PC RIP: 0010:__cpuhp_state_remove_instance+0xcc/0x120 Call Trace: padata_sysfs_release+0x74/0xce kobject_put+0x81/0xd0 padata_free+0x12/0x20 pcrypt_exit+0x43/0x8ee [pcrypt] padata instances wrongly use the same hlist node for the online and dead states, so __padata_free()'s second cpuhp remove call chokes on the node that the first poisoned. cpuhp multi-instance callbacks only walk forward in cpuhp_step->list and the same node is linked in both the online and dead lists, so the list corruption that results from padata_alloc() adding the node to a second list without removing it from the first doesn't cause problems as long as no instances are freed. Avoid the issue by giving each state its own node. Fixes: 894c9ef9780c ("padata: validate cpumask without removed CPU during offline") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Herbert Xu --- kernel/padata.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..aae789896616 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,8 +734,9 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); @@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); -- cgit v1.2.3 From 1dd694a1b72f69a3add938f4b5bfb4cf9914b133 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Apr 2020 12:19:04 +0200 Subject: remove the no longer needed pid_alive() check in __task_pid_nr_ns() Starting from 2c4704756cab ("pids: Move the pgrp and session pid pointers from task_struct to signal_struct") __task_pid_nr_ns() doesn't dereference task->group_leader, we can remove the pid_alive() check. pid_nr_ns() has to check pid != NULL anyway, pid_alive() just adds the unnecessary confusion. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Acked-by: Christian Brauner Signed-off-by: Eric W. Biederman --- kernel/pid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 6d5d0a5bda82..f1496b757162 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -495,8 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; -- cgit v1.2.3 From 7f645462ca01d01abb94d75e6768c8b3ed3a188b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 30 Apr 2020 08:18:51 +0000 Subject: bpf: Fix error return code in map_lookup_and_delete_elem() Fix to return negative error code -EFAULT from the copy_to_user() error handling case instead of 0, as done elsewhere in this function. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430081851.166996-1-weiyongjun1@huawei.com --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..2843bbba9ca1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1485,8 +1485,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (err) goto free_value; - if (copy_to_user(uvalue, value, value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) { + err = -EFAULT; goto free_value; + } err = 0; -- cgit v1.2.3 From 2ed6edd33a214bca02bd2b45e3fc3038a059436b Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Tue, 14 Apr 2020 18:29:20 -0400 Subject: perf: Add cond_resched() to task_function_call() Under rare circumstances, task_function_call() can repeatedly fail and cause a soft lockup. There is a slight race where the process is no longer running on the cpu we targeted by the time remote_function() runs. The code will simply try again. If we are very unlucky, this will continue to fail, until a watchdog fires. This can happen in a heavily loaded, multi-core virtual machine. Reported-by: syzbot+bb4935a5c09b5ff79940@syzkaller.appspotmail.com Signed-off-by: Barret Rhoden Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200414222920.121401-1-brho@google.com --- kernel/events/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 52951e9e8e1b..80cf996a7f19 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } -- cgit v1.2.3 From f080d93e1d419099a99d7473ed532289ca8dc717 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 14 Apr 2020 20:57:21 +0800 Subject: sched/debug: Fix trival print_task() format Ensure leave one space between state and task name. w/o patch: runnable tasks: S task PID tree-key switches prio wait Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200414125721.195801-1-xiexiuqi@huawei.com --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a562df57a86e..b3ac1c10b032 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { -- cgit v1.2.3 From e98fa02c4f2ea4991dae422ac7e34d102d2f0599 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 10 Apr 2020 15:52:07 -0700 Subject: sched/fair: Eliminate bandwidth race between throttling and distribution There is a race window in which an entity begins throttling before quota is added to the pool, but does not finish throttling until after we have finished with distribute_cfs_runtime(). This entity is not observed by distribute_cfs_runtime() because it was not on the throttled list at the time that distribution was running. This race manifests as rare period-length statlls for such entities. Rather than heavy-weight the synchronization with the progress of distribution, we can fix this by aborting throttling if bandwidth has become available. Otherwise, we immediately add the entity to the throttled list so that it can be observed by a subsequent distribution. Additionally, we can remove the case of adding the throttled entity to the head of the throttled list, and simply always add to the tail. Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto its own pool of runtime. This means that if we do hit the !assign and distribute_running case, we know that distribution is about to end. Signed-off-by: Paul Turner Signed-off-by: Ben Segall Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-2-joshdon@google.com --- kernel/sched/fair.c | 79 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..0c13a41bde81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4588,16 +4588,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4609,13 +4609,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4704,13 +4716,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4744,29 +4776,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -5121,8 +5137,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -- cgit v1.2.3 From ab93a4bc955b3980c699430bc0b633f0d8b607be Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 10 Apr 2020 15:52:08 -0700 Subject: sched/fair: Remove distribute_running from CFS bandwidth This is mostly a revert of commit: baa9be4ffb55 ("sched/fair: Fix throttle_list starvation with low CFS quota") The primary use of distribute_running was to determine whether to add throttled entities to the head or the tail of the throttled list. Now that we always add to the tail, we can remove this field. The other use of distribute_running is in the slack_timer, so that we don't start a distribution while one is already running. However, even in the event that this race occurs, it is fine to have two distributions running (especially now that distribute grabs the cfs_b->lock to determine remaining quota before assigning). Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-3-joshdon@google.com --- kernel/sched/fair.c | 13 +------------ kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c13a41bde81..3d6ce751abb6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4931,14 +4931,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5052,10 +5050,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5065,9 +5059,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5076,7 +5067,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5218,7 +5208,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..7198683b2869 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,7 +349,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; -- cgit v1.2.3 From 64297f2b03cc7a6d0184a435f1b296beca1bedd1 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Sat, 11 Apr 2020 17:20:20 +0800 Subject: sched/fair: Simplify the code of should_we_balance() We only consider group_balance_cpu() after there is no idle cpu. So, just do comparison before return at these two cases. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/245c792f0e580b3ca342ad61257f4c066ee0f84f.1586594833.git.rocking@linux.alibaba.com --- kernel/sched/fair.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3d6ce751abb6..63419c6d9641 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9413,7 +9413,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9434,18 +9434,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* -- cgit v1.2.3 From 586b58cac8b4683eb58a1446fbc399de18974e40 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 5 Mar 2020 23:06:57 +0100 Subject: exit: Move preemption fixup up, move blocking operations down With CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_CGROUPS=y, kernel oopses in non-preemptible context look untidy; after the main oops, the kernel prints a "sleeping function called from invalid context" report because exit_signals() -> cgroup_threadgroup_change_begin() -> percpu_down_read() can sleep, and that happens before the preempt_count_set(PREEMPT_ENABLED) fixup. It looks like the same thing applies to profile_task_exit() and kcov_task_exit(). Fix it by moving the preemption fixup up and the calls to profile_task_exit() and kcov_task_exit() down. Fixes: 1dc0fffc48af ("sched/core: Robustify preemption leak checks") Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200305220657.46800-1-jannh@google.com --- kernel/exit.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..d56fe51bdf07 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -708,8 +708,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +731,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +758,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); -- cgit v1.2.3 From 45da27732b0b9b7a04696653065d5e6037bc5ac0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:04 +0100 Subject: sched/fair: find_idlest_group(): Remove unused sd_flag parameter The last use of that parameter was removed by commit 57abff067a08 ("sched/fair: Rework find_idlest_group()") Get rid of the parameter. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200415210512.805-2-valentin.schneider@arm.com --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63419c6d9641..617ca44ed61b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5825,8 +5825,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5909,7 +5908,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -8681,8 +8680,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; -- cgit v1.2.3 From 9818427c6270a9ce8c52c8621026fe9cebae0f92 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:05 +0100 Subject: sched/debug: Make sd->flags sysctl read-only Writing to the sysctl of a sched_domain->flags directly updates the value of the field, and goes nowhere near update_top_cache_domain(). This means that the cached domain pointers can end up containing stale data (e.g. the domain pointed to doesn't have the relevant flag set anymore). Explicit domain walks that check for flags will be affected by the write, but this won't be in sync with the cached pointers which will still point to the domains that were cached at the last sched_domain build. In other words, writing to this interface is playing a dangerous game. It could be made to trigger an update of the cached sched_domain pointers when written to, but this does not seem to be worth the trouble. Make it read-only. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-3-valentin.schneider@arm.com --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b3ac1c10b032..c6cc02a6aad0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ -- cgit v1.2.3 From e669ac8ab952df2f07dee1e1efbf40647d6de332 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:06 +0100 Subject: sched: Remove checks against SD_LOAD_BALANCE The SD_LOAD_BALANCE flag is set unconditionally for all domains in sd_init(). By making the sched_domain->flags syctl interface read-only, we have removed the last piece of code that could clear that flag - as such, it will now be always present. Rather than to keep carrying it along, we can work towards getting rid of it entirely. cpusets don't need it because they can make CPUs be attached to the NULL domain (e.g. cpuset with sched_load_balance=0), or to a partitioned root_domain, i.e. a sched_domain hierarchy that doesn't span the entire system (e.g. root cpuset with sched_load_balance=0 and sibling cpusets with sched_load_balance=1). isolcpus apply the same "trick": isolated CPUs are explicitly taken out of the sched_domain rebuild (using housekeeping_cpumask()), so they get the NULL domain treatment as well. Remove the checks against SD_LOAD_BALANCE. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-4-valentin.schneider@arm.com --- kernel/sched/fair.c | 14 ++------------ kernel/sched/topology.c | 28 +++++++++------------------- 2 files changed, 11 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 617ca44ed61b..4b959c0a7d7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6649,9 +6649,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -9790,9 +9787,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9881,9 +9877,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10472,9 +10465,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..a9dc34a0ebc1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } -- cgit v1.2.3 From 36c5bdc4387056af3840adb4478c752faeb9d15e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:07 +0100 Subject: sched/topology: Kill SD_LOAD_BALANCE That flag is set unconditionally in sd_init(), and no one checks for it anymore. Remove it. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-5-valentin.schneider@arm.com --- kernel/sched/topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a9dc34a0ebc1..1d7b446fac7d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1341,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE -- cgit v1.2.3 From d91cecc156620ec75d94c55369509c807c3d07e6 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 21 Apr 2020 18:50:34 +0800 Subject: sched: Make newidle_balance() static again After Commit 6e2df0581f56 ("sched: Fix pick_next_task() vs 'change' pattern race"), there is no need to expose newidle_balance() as it is only used within fair.c file. Change this function back to static again. No functional change. Reported-by: kbuild test robot Suggested-by: Peter Zijlstra Signed-off-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/83cd3030b031ca5d646cd5e225be10e7a0fdd8f5.1587464698.git.yu.c.chen@intel.com --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 4 ---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b959c0a7d7b..c0216ef33c0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3873,6 +3873,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -4054,7 +4056,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -10414,7 +10416,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7198683b2869..978c6fac8cb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1503,14 +1503,10 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); - #else static inline void sched_ttwu_pending(void) { } -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From 457d1f465778ccb5f14f7d7a62245e41d12a3804 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 21 Apr 2020 18:50:43 +0800 Subject: sched: Extract the task putting code from pick_next_task() Introduce a new function put_prev_task_balance() to do the balance when necessary, and then put previous task back to the run queue. This function is extracted from pick_next_task() to prepare for future usage by other type of task picking logic. No functional change. Suggested-by: Peter Zijlstra Signed-off-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/5a99860cf66293db58a397d6248bcb2eee326776.1587464698.git.yu.c.chen@intel.com --- kernel/sched/core.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..2e6ba9e301f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3899,6 +3899,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3932,22 +3954,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); -- cgit v1.2.3 From 5a6d6a6ccb5f48ca8cf7c6d64ff83fd9c7999390 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Mon, 20 Apr 2020 10:44:21 +0800 Subject: sched/fair: Refill bandwidth before scaling In order to prevent possible hardlockup of sched_cfs_period_timer() loop, loop count is introduced to denote whether to scale quota and period or not. However, scale is done between forwarding period timer and refilling cfs bandwidth runtime, which means that period timer is forwarded with old "period" while runtime is refilled with scaled "quota". Move do_sched_cfs_period_timer() before scaling to solve this. Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup") Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200420024421.22442-3-changhuaixin@linux.alibaba.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0216ef33c0c..fac5b2f85247 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5159,6 +5159,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5188,8 +5190,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; -- cgit v1.2.3 From f38f12d1e0811c0ee59260b2bdadedf99e16c4af Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 6 Apr 2020 15:47:50 +0800 Subject: sched/fair: Mark sched_init_granularity __init Function sched_init_granularity() is only called from __init functions, so mark it __init as well. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200406074750.56533-1-songmuchun@bytedance.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fac5b2f85247..cd7fd7e2b579 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -191,7 +191,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } -- cgit v1.2.3 From bf2c59fce4074e55d622089b34be3a6bc95484fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 17:40:33 -0400 Subject: sched/core: Fix illegal RCU from offline CPUs In the CPU-offline process, it calls mmdrop() after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaining when mmdrop() uses RCU from either memcg or debugobjects below. Fix it by cleaning up the active_mm state from BP instead. Every arch which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit() from AP. The only exception is parisc because it switches them to &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()), but the patch will still work there because it calls mmgrab(&init_mm) in smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu(). WARNING: suspicious RCU usage ----------------------------- kernel/workqueue.c:710 RCU or wq_pool_mutex should be held! other info that might help us debug this: RCU used illegally from offline CPU! Call Trace: dump_stack+0xf4/0x164 (unreliable) lockdep_rcu_suspicious+0x140/0x164 get_work_pool+0x110/0x150 __queue_work+0x1bc/0xca0 queue_work_on+0x114/0x120 css_release+0x9c/0xc0 percpu_ref_put_many+0x204/0x230 free_pcp_prepare+0x264/0x570 free_unref_page+0x38/0xf0 __mmdrop+0x21c/0x2c0 idle_task_exit+0x170/0x1b0 pnv_smp_cpu_kill_self+0x38/0x2e0 cpu_die+0x48/0x64 arch_cpu_idle_dead+0x30/0x50 do_idle+0x2f4/0x470 cpu_startup_entry+0x38/0x40 start_secondary+0x7a8/0xa80 start_secondary_resume+0x10/0x14 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman (powerpc) Link: https://lkml.kernel.org/r/20200401214033.8448-1-cai@lca.pw --- kernel/cpu.c | 18 +++++++++++++++++- kernel/sched/core.c | 5 +++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..244d30544377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include #include #include #include @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e6ba9e301f0..f6ae2629f4e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6197,13 +6197,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* -- cgit v1.2.3 From 17c891ab349138e8d8a59ca2700f42ce8af96f4e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 21 Apr 2020 22:41:23 +0800 Subject: sched/fair: Use __this_cpu_read() in wake_wide() The code is executed with preemption(and interrupts) disabled, so it's safe to use __this_cpu_write(). Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421144123.33580-1-songmuchun@bytedance.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd7fd7e2b579..46b7bd41573f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5718,7 +5718,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); -- cgit v1.2.3 From b1d1779e5ef7a60b192b61fd97201f322e1e9303 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 23 Apr 2020 21:44:43 +0000 Subject: sched/core: Simplify sched_init() Currently root_task_group.shares and cfs_bandwidth are initialized for each online cpu, which not necessary. Let's take it out to do it only once. Signed-off-by: Wei Yang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200423214443.29994-1-richard.weiyang@gmail.com --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6ae2629f4e1..b58efb1156eb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6597,6 +6597,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6649,7 +6651,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6671,7 +6672,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 41cd780524674082b037e7c8461f90c5e42103f0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 3 Apr 2020 07:20:51 +0000 Subject: uaccess: Selectively open read or write user access When opening user access to only perform reads, only open read access. When opening user access to only perform writes, only open write access. Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2e73bc57125c2c6ab12a587586a4eed3a47105fc.1585898438.git.christophe.leroy@c-s.fr --- kernel/compat.c | 12 ++++++------ kernel/exit.c | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index 389a88cb3081..2d97cbba512d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1557,7 +1557,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1566,10 +1566,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1684,7 +1684,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1693,10 +1693,10 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif -- cgit v1.2.3 From db9ff6ecf6efa6ffc45bfd5dc8d5708cfe5e89cb Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Wed, 29 Apr 2020 17:26:48 +0800 Subject: audit: make symbol 'audit_nfcfgs' static Fix sparse warnings: kernel/auditsc.c:138:32: warning: symbol 'audit_nfcfgs' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d281c18d1771..cfe3486e5f31 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -135,7 +135,7 @@ struct audit_nfcfgop_tab { const char *s; }; -const struct audit_nfcfgop_tab audit_nfcfgs[] = { +static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_XT_OP_REGISTER, "register" }, { AUDIT_XT_OP_REPLACE, "replace" }, { AUDIT_XT_OP_UNREGISTER, "unregister" }, -- cgit v1.2.3 From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keep backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 ++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) -- cgit v1.2.3 From 138c67677ff5ac0bce7131033c39d52a81e87a60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 May 2020 11:56:22 -0700 Subject: bpf: Fix use-after-free of bpf_link when priming half-fails If bpf_link_prime() succeeds to allocate new anon file, but then fails to allocate ID for it, link priming is considered to be failed and user is supposed ot be able to directly kfree() bpf_link, because it was never exposed to user-space. But at that point file already keeps a pointer to bpf_link and will eventually call bpf_link_release(), so if bpf_link was kfree()'d by caller, that would lead to use-after-free. Fix this by first allocating ID and only then allocating file. Adding ID to link_idr is ok, because link at that point still doesn't have its ID set, so no user-space process can create a new FD for it. Fixes: a3b80e107894 ("bpf: Allocate ID for bpf_link") Reported-by: syzbot+39b64425f91b5aab714d@syzkaller.appspotmail.com Suggested-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200501185622.3088964-1-andriin@fb.com --- kernel/bpf/syscall.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4f34eecec9ce..bb1ab7da6103 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2348,19 +2348,20 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) if (fd < 0) return fd; - file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } id = bpf_link_alloc_id(link); if (id < 0) { put_unused_fd(fd); - fput(file); return id; } + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + bpf_link_free_id(id); + put_unused_fd(fd); + return PTR_ERR(file); + } + primer->link = link; primer->file = file; primer->fd = fd; -- cgit v1.2.3 From f187b6974f6dfbeba4aafda972cc37f27d091b73 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Wed, 29 Apr 2020 12:04:13 +0800 Subject: workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO. Replace inline function PTR_ERR_OR_ZERO with IS_ERR and PTR_ERR to remove redundant parameter definitions and checks. Reduce code size. Before: text data bss dec hex filename 47510 5979 840 54329 d439 kernel/workqueue.o After: text data bss dec hex filename 47474 5979 840 54293 d415 kernel/workqueue.o Signed-off-by: Sean Fu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 891ccad5f271..ddf0537dce14 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,7 +4197,6 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; - int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,10 +4207,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - ret = PTR_ERR_OR_ZERO(rescuer->task); - if (ret) { + if (IS_ERR(rescuer->task)) { kfree(rescuer); - return ret; + return PTR_ERR(rescuer->task); } wq->rescuer = rescuer; -- cgit v1.2.3 From 5447e8e01e101ba19fe5b7551f02d37367156f6b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2020 16:07:12 +0200 Subject: sysctl: Fix unused function warning The newly added bpf_stats_handler function has the wrong #ifdef check around it, leading to an unused-function warning when CONFIG_SYSCTL is disabled: kernel/sysctl.c:205:12: error: unused function 'bpf_stats_handler' [-Werror,-Wunused-function] static int bpf_stats_handler(struct ctl_table *table, int write, Fix the check to match the reference. Fixes: d46edd671a14 ("bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS") Signed-off-by: Arnd Bergmann Signed-off-by: Alexei Starovoitov Reviewed-by: Luis Chamberlain Acked-by: Martin KaFai Lau Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200505140734.503701-1-arnd@arndb.de --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7adfe5dbce9d..17c7633d90fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,7 +201,7 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From c3b3f52476412a3899f2c65b220075aceb18dd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 12:12:53 +0200 Subject: signal: refactor copy_siginfo_to_user32 Factor out a copy_siginfo_to_external32 helper from copy_siginfo_to_user32 that fills out the compat_siginfo, but does so on a kernel space data structure. With that we can let architectures override copy_siginfo_to_user32 with their own implementations using copy_siginfo_to_external32. That allows moving the x32 SIGCHLD purely to x86 architecture code. As a nice side effect copy_siginfo_to_external32 also comes in handy for avoiding a set_fs() call in the coredump code later on. Contains improvements from Eric W. Biederman and Arnd Bergmann . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/signal.c | 106 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e58a6c619824..2bc9458d40bd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } -- cgit v1.2.3 From dcbd21c9fca5e954fd4e3d91884907eb6d47187e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:09 +0900 Subject: tracing/kprobes: Fix a double initialization typo Fix a typo that resulted in an unnecessary double initialization to addr. Link: http://lkml.kernel.org/r/158779374968.6082.2337484008464939919.stgit@devnote2 Cc: Tom Zanussi Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: c7411a1a126f ("tracing/kprobe: Check whether the non-suffixed symbol is notrace") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d0568af4a0ef..0d9300c3b084 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -453,7 +453,7 @@ static bool __within_notrace_func(unsigned long addr) static bool within_notrace_func(struct trace_kprobe *tk) { - unsigned long addr = addr = trace_kprobe_address(tk); + unsigned long addr = trace_kprobe_address(tk); char symname[KSYM_NAME_LEN], *p; if (!__within_notrace_func(addr)) -- cgit v1.2.3 From da0f1f4167e3af69e1d8b32d6d65195ddd2bfb64 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:17 +0900 Subject: tracing/boottime: Fix kprobe event API usage Fix boottime kprobe events to use API correctly for multiple events. For example, when we set a multiprobe kprobe events in bootconfig like below, ftrace.event.kprobes.myevent { probes = "vfs_read $arg1 $arg2", "vfs_write $arg1 $arg2" } This cause an error; trace_boot: Failed to add probe: p:kprobes/myevent (null) vfs_read $arg1 $arg2 vfs_write $arg1 $arg2 This shows the 1st argument becomes NULL and multiprobes are merged to 1 probe. Link: http://lkml.kernel.org/r/158779375766.6082.201939936008972838.stgit@devnote2 Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 29a154810546 ("tracing: Change trace_boot to use kprobe_event interface") Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 06d7feb5255f..9de29bb45a27 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -95,24 +95,20 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) struct xbc_node *anode; char buf[MAX_BUF_LEN]; const char *val; - int ret; + int ret = 0; - kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); + xbc_node_for_each_array_value(node, "probes", anode, val) { + kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); - ret = kprobe_event_gen_cmd_start(&cmd, event, NULL); - if (ret) - return ret; + ret = kprobe_event_gen_cmd_start(&cmd, event, val); + if (ret) + break; - xbc_node_for_each_array_value(node, "probes", anode, val) { - ret = kprobe_event_add_field(&cmd, val); + ret = kprobe_event_gen_cmd_end(&cmd); if (ret) - return ret; + pr_err("Failed to add probe: %s\n", buf); } - ret = kprobe_event_gen_cmd_end(&cmd); - if (ret) - pr_err("Failed to add probe: %s\n", buf); - return ret; } #else -- cgit v1.2.3 From 5b4dcd2d201a395ad4054067bfae4a07554fbd65 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:26 +0900 Subject: tracing/kprobes: Reject new event if loc is NULL Reject the new event which has NULL location for kprobes. For kprobes, user must specify at least the location. Link: http://lkml.kernel.org/r/158779376597.6082.1411212055469099461.stgit@devnote2 Cc: Tom Zanussi Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0d9300c3b084..35989383ae11 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -940,6 +940,9 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); * complete command or only the first part of it; in the latter case, * kprobe_event_add_fields() can be used to add more fields following this. * + * Unlikely the synth_event_gen_cmd_start(), @loc must be specified. This + * returns -EINVAL if @loc == NULL. + * * Return: 0 if successful, error otherwise. */ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, @@ -953,6 +956,9 @@ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; + if (!loc) + return -EINVAL; + if (kretprobe) snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name); else -- cgit v1.2.3 From 19acd03d95dad1f50d06f28179a1866fca431fed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Apr 2020 17:47:29 +0200 Subject: kcsan: Add __kcsan_{enable,disable}_current() variants The __kcsan_{enable,disable}_current() variants only call into KCSAN if KCSAN is enabled for the current compilation unit. Note: This is typically not what we want, as we usually want to ensure that even calls into other functions still have KCSAN disabled. These variants may safely be used in header files that are shared between regular kernel code and code that does not link the KCSAN runtime. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index a572aae61b98..a73a66cf79df 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -625,6 +625,13 @@ void kcsan_enable_current(void) } EXPORT_SYMBOL(kcsan_enable_current); +void kcsan_enable_current_nowarn(void) +{ + if (get_ctx()->disable_count-- == 0) + kcsan_disable_current(); +} +EXPORT_SYMBOL(kcsan_enable_current_nowarn); + void kcsan_nestable_atomic_begin(void) { /* -- cgit v1.2.3 From 565558558985b1d7cd43b21f18c1ad6b232788d0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:03 +0100 Subject: cpu/hotplug: Remove disable_nonboot_cpus() The single user could have called freeze_secondary_cpus() directly. Since this function was a source of confusion, remove it as it's just a pointless wrapper. While at it, rename enable_nonboot_cpus() to thaw_secondary_cpus() to preserve the naming symmetry. Done automatically via: git grep -l enable_nonboot_cpus | xargs sed -i 's/enable_nonboot_cpus/thaw_secondary_cpus/g' Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-1-qais.yousef@arm.com --- kernel/cpu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a02d4434cf7..d766929e0b7e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1376,8 +1376,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1385,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1405,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1418,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: -- cgit v1.2.3 From fb7fb84a0c4e8021ddecb157802d58241a3f1a40 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:04 +0100 Subject: cpu/hotplug: Remove __freeze_secondary_cpus() The refactored function is no longer required as the codepaths that call freeze_secondary_cpus() are all suspend/resume related now. Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-2-qais.yousef@arm.com --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d766929e0b7e..9f892144db6b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1327,7 +1327,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1352,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; -- cgit v1.2.3 From a13502073638a0b28d84099fa985971fe3287aee Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 6 May 2020 18:17:27 +0300 Subject: kgdb: Drop malformed kernel doc comment Kernel doc does not understand POD variables to be referred to. .../debug_core.c:73: warning: cannot understand function prototype: 'int kgdb_connected; ' Convert kernel doc to pure comment. Fixes: dc7d55270521 ("kgdb: core") Cc: Jason Wessel Signed-off-by: Andy Shevchenko Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..2266ba27f27d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -67,9 +67,7 @@ static int kgdb_break_asap; struct debuggerinfo_struct kgdb_info[NR_CPUS]; -/** - * kgdb_connected - Is a host GDB connected to us? - */ +/* kgdb_connected - Is a host GDB connected to us? */ int kgdb_connected; EXPORT_SYMBOL_GPL(kgdb_connected); -- cgit v1.2.3 From 19a8ff956c5abaaedfae23b8a951dd2d725a2171 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 17:39:12 -0700 Subject: rcutorture: Add flag to produce non-busy-wait task stalls This commit aids testing of RCU task stall warning messages by adding an rcutorture.stall_cpu_block module parameter that results in the induced stall sleeping within the RCU read-side critical section. Spinning with interrupts disabled is still available via the rcutorture.stall_cpu_irqsoff module parameter, and specifying neither of these two module parameters will spin with preemption disabled. Note that sleeping (as opposed to preemption) results in additional complaints from RCU at context-switch time, so yet more testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d0345d14e22a..60dc36893aad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -114,6 +114,7 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1548,6 +1549,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1556,6 +1558,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1611,6 +1614,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1622,21 +1626,22 @@ static int rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); + cur_ops->readunlock(idx); pr_alert("rcu_torture_stall end.\n"); } torture_shutdown_absorb("rcu_torture_stall"); -- cgit v1.2.3 From 55b2dcf58700041d6f0b037a98619222c825f004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Apr 2020 19:57:52 -0700 Subject: rcu: Allow rcutorture to starve grace-period kthread This commit provides an rcutorture.stall_gp_kthread module parameter to allow rcutorture to starve the grace-period kthread. This allows testing the code that detects such starvation. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 18 +++++++++++++++--- kernel/rcu/tree.c | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cdbc5f98f584 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -454,6 +454,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +472,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 60dc36893aad..3d47dca4d61c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,8 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1623,7 +1625,17 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); @@ -1642,8 +1654,8 @@ static int rcu_torture_stall(void *args) else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); - pr_alert("rcu_torture_stall end.\n"); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1653,7 +1665,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 156ac8d0418b..be7dde8807d0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1486,6 +1486,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1686,6 +1711,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1834,6 +1860,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) -- cgit v1.2.3 From afbc1574f1da13d2fd2b30a96090b37c5933f957 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 9 Apr 2020 19:42:38 +0800 Subject: rcutorture: Make rcu_fwds and rcu_fwd_emergency_stop static This commit fixes the following sparse warning: kernel/rcu/rcutorture.c:1695:16: warning: symbol 'rcu_fwds' was not declared. Should it be static? kernel/rcu/rcutorture.c:1696:6: warning: symbol 'rcu_fwd_emergency_stop' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3d47dca4d61c..c7b7594bd2d8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1721,8 +1721,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { -- cgit v1.2.3 From 3c80b4024579150ddb8ddd6fd7b110ba192aca3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 15:37:12 -0700 Subject: rcutorture: Convert ULONG_CMP_LT() to time_before() This commit converts three ULONG_CMP_LT() invocations in rcutorture to time_before() to reflect the fact that they are comparing timestamps to the jiffies counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c7b7594bd2d8..fc961472dc8e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -848,7 +848,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -858,7 +858,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -929,7 +929,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } -- cgit v1.2.3 From d16a8c31077e75ecb9427fbfea59b74eed00f698 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 May 2020 10:20:10 -0400 Subject: tracing: Wait for preempt irq delay thread to finish Running on a slower machine, it is possible that the preempt delay kernel thread may still be executing if the module was immediately removed after added, and this can cause the kernel to crash as the kernel thread might be executing after its code has been removed. There's no reason that the caller of the code shouldn't just wait for the delay thread to finish, as the thread can also be created by a trigger in the sysfs code, which also has the same issues. Link: http://lore.kernel.org/r/5EA2B0C8.2080706@cn.fujitsu.com Cc: stable@vger.kernel.org Fixes: 793937236d1ee ("lib: Add module for testing preemptoff/irqsoff latency tracers") Reported-by: Xiao Yang Reviewed-by: Xiao Yang Reviewed-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/preemptirq_delay_test.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 31c0fad4cb9e..c4c86de63cf9 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -113,22 +113,42 @@ static int preemptirq_delay_run(void *data) for (i = 0; i < s; i++) (testfuncs[i])(i); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + __set_current_state(TASK_RUNNING); + return 0; } -static struct task_struct *preemptirq_start_test(void) +static int preemptirq_run_test(void) { + struct task_struct *task; + char task_name[50]; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); - return kthread_run(preemptirq_delay_run, NULL, task_name); + task = kthread_run(preemptirq_delay_run, NULL, task_name); + if (IS_ERR(task)) + return PTR_ERR(task); + if (task) + kthread_stop(task); + return 0; } static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - preemptirq_start_test(); + ssize_t ret; + + ret = preemptirq_run_test(); + if (ret) + return ret; return count; } @@ -148,11 +168,9 @@ static struct kobject *preemptirq_delay_kobj; static int __init preemptirq_delay_init(void) { - struct task_struct *test_task; int retval; - test_task = preemptirq_start_test(); - retval = PTR_ERR_OR_ZERO(test_task); + retval = preemptirq_run_test(); if (retval != 0) return retval; -- cgit v1.2.3 From 11f5efc3ab66284f7aaacc926e9351d658e2577b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 May 2020 10:36:18 -0400 Subject: tracing: Add a vmalloc_sync_mappings() for safe measure x86_64 lazily maps in the vmalloc pages, and the way this works with per_cpu areas can be complex, to say the least. Mappings may happen at boot up, and if nothing synchronizes the page tables, those page mappings may not be synced till they are used. This causes issues for anything that might touch one of those mappings in the path of the page fault handler. When one of those unmapped mappings is touched in the page fault handler, it will cause another page fault, which in turn will cause a page fault, and leave us in a loop of page faults. Commit 763802b53a42 ("x86/mm: split vmalloc_sync_all()") split vmalloc_sync_all() into vmalloc_sync_unmappings() and vmalloc_sync_mappings(), as on system exit, it did not need to do a full sync on x86_64 (although it still needed to be done on x86_32). By chance, the vmalloc_sync_all() would synchronize the page mappings done at boot up and prevent the per cpu area from being a problem for tracing in the page fault handler. But when that synchronization in the exit of a task became a nop, it caused the problem to appear. Link: https://lore.kernel.org/r/20200429054857.66e8e333@oasis.local.home Cc: stable@vger.kernel.org Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code") Reported-by: "Tzvetomir Stoyanov (VMware)" Suggested-by: Joerg Roedel Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..9ed6d92768af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8525,6 +8525,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) */ allocate_snapshot = false; #endif + + /* + * Because of some magic with the way alloc_percpu() works on + * x86_64, we need to synchronize the pgd of all the tables, + * otherwise the trace events that happen in x86_64 page fault + * handlers can't cope with accessing the chance that a + * alloc_percpu()'d memory might be touched in the page fault trace + * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() + * calls in tracing, because something might get triggered within a + * page fault trace event! + */ + vmalloc_sync_mappings(); + return 0; } -- cgit v1.2.3 From 192b7993b3ff92b62b687e940e5e88fa0218d764 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 23 Apr 2020 12:08:25 +0800 Subject: tracing: Make tracing_snapshot_instance_cond() static Fix the following sparse warning: kernel/trace/trace.c:950:6: warning: symbol 'tracing_snapshot_instance_cond' was not declared. Should it be static? Link: http://lkml.kernel.org/r/1587614905-48692-1-git-send-email-zou_wei@huawei.com Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ed6d92768af..29615f15a820 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -947,7 +947,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; -- cgit v1.2.3 From 96ecee29b0b560662ec082ee9b6f2049f2a79090 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 3 May 2020 06:48:17 -0500 Subject: exec: Merge install_exec_creds into setup_new_exec The two functions are now always called one right after the other so merge them together to make future maintenance easier. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. Biederman" --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..169449b5e56b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12217,7 +12217,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_mutex held when called from - * install_exec_creds(). + * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { -- cgit v1.2.3 From dcf550e52f567cb7a421169d2522869f9188aca5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:43 -0500 Subject: livepatch: Disallow vmlinux.ko This is purely a theoretical issue, but if there were a module named vmlinux.ko, the livepatch relocation code wouldn't be able to distinguish between vmlinux-specific and vmlinux.o-specific KLP relocations. If CONFIG_LIVEPATCH is enabled, don't allow a module named vmlinux.ko. Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Miroslav Benes Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c3512e7e0801..40cfac8156fd 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1139,6 +1139,11 @@ int klp_module_coming(struct module *mod) if (WARN_ON(mod->state != MODULE_STATE_COMING)) return -EINVAL; + if (!strcmp(mod->name, "vmlinux")) { + pr_err("vmlinux.ko: invalid module name"); + return -EINVAL; + } + mutex_lock(&klp_mutex); /* * Each module has to know that klp_module_coming() -- cgit v1.2.3 From 7c8e2bdd5f0d990e2398ee3deafc626dd469fc2d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:44 -0500 Subject: livepatch: Apply vmlinux-specific KLP relocations early KLP relocations are livepatch-specific relocations which are applied to a KLP module's text or data. They exist for two reasons: 1) Unexported symbols: replacement functions often need to access unexported symbols (e.g. static functions), which "normal" relocations don't allow. 2) Late module patching: this is the ability for a KLP module to bypass normal module dependencies, such that the KLP module can be loaded *before* a to-be-patched module. This means that relocations which need to access symbols in the to-be-patched module might need to be applied to the KLP module well after it has been loaded. Non-late-patched KLP relocations are applied from the KLP module's init function. That usually works fine, unless the patched code wants to use alternatives, paravirt patching, jump tables, or some other special section which needs relocations. Then we run into ordering issues and crashes. In order for those special sections to work properly, the KLP relocations should be applied *before* the special section init code runs, such as apply_paravirt(), apply_alternatives(), or jump_label_apply_nops(). You might think the obvious solution would be to move the KLP relocation initialization earlier, but it's not necessarily that simple. The problem is the above-mentioned late module patching, for which KLP relocations can get applied well after the KLP module is loaded. To "fix" this issue in the past, we created .klp.arch sections: .klp.arch.{module}..altinstructions .klp.arch.{module}..parainstructions Those sections allow KLP late module patching code to call apply_paravirt() and apply_alternatives() after the module-specific KLP relocations (.klp.rela.{module}.{section}) have been applied. But that has a lot of drawbacks, including code complexity, the need for arch-specific code, and the (per-arch) danger that we missed some special section -- for example the __jump_table section which is used for jump labels. It turns out there's a simpler and more functional approach. There are two kinds of KLP relocation sections: 1) vmlinux-specific KLP relocation sections .klp.rela.vmlinux.{sec} These are relocations (applied to the KLP module) which reference unexported vmlinux symbols. 2) module-specific KLP relocation sections .klp.rela.{module}.{sec}: These are relocations (applied to the KLP module) which reference unexported or exported module symbols. Up until now, these have been treated the same. However, they're inherently different. Because of late module patching, module-specific KLP relocations can be applied very late, thus they can create the ordering headaches described above. But vmlinux-specific KLP relocations don't have that problem. There's nothing to prevent them from being applied earlier. So apply them at the same time as normal relocations, when the KLP module is being loaded. This means that for vmlinux-specific KLP relocations, we no longer have any ordering issues. vmlinux-referencing jump labels, alternatives, and paravirt patching will work automatically, without the need for the .klp.arch hacks. All that said, for module-specific KLP relocations, the ordering problems still exist and we *do* still need .klp.arch. Or do we? Stay tuned. Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 137 ++++++++++++++++++++++++++++++------------------ kernel/module.c | 10 ++-- 2 files changed, 92 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 40cfac8156fd..c02791e5c75b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -191,12 +191,12 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) +static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, + unsigned int symndx, Elf_Shdr *relasec) { int i, cnt, vmlinux, ret; char objname[MODULE_NAME_LEN]; char symname[KSYM_NAME_LEN]; - char *strtab = pmod->core_kallsyms.strtab; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; @@ -216,7 +216,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); @@ -246,54 +246,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) return 0; } -static int klp_write_object_relocations(struct module *pmod, - struct klp_object *obj) +/* + * At a high-level, there are two types of klp relocation sections: those which + * reference symbols which live in vmlinux; and those which reference symbols + * which live in other modules. This function is called for both types: + * + * 1) When a klp module itself loads, the module code calls this function to + * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections). + * These relocations are written to the klp module text to allow the patched + * code/data to reference unexported vmlinux symbols. They're written as + * early as possible to ensure that other module init code (.e.g., + * jump_label_apply_nops) can access any unexported vmlinux symbols which + * might be referenced by the klp module's special sections. + * + * 2) When a to-be-patched module loads -- or is already loaded when a + * corresponding klp module loads -- klp code calls this function to write + * module-specific klp relocations (.klp.rela.{module}.* sections). These + * are written to the klp module text to allow the patched code/data to + * reference symbols which live in the to-be-patched module or one of its + * module dependencies. Exported symbols are supported, in addition to + * unexported symbols, in order to enable late module patching, which allows + * the to-be-patched module to be loaded and patched sometime *after* the + * klp module is loaded. + */ +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) { - int i, cnt, ret = 0; - const char *objname, *secname; + int cnt, ret; char sec_objname[MODULE_NAME_LEN]; - Elf_Shdr *sec; + Elf_Shdr *sec = sechdrs + secndx; - if (WARN_ON(!klp_is_object_loaded(obj))) + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name\n", + shstrtab + sec->sh_name); return -EINVAL; + } - objname = klp_is_module(obj) ? obj->name : "vmlinux"; - - /* For each klp relocation section */ - for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { - sec = pmod->klp_info->sechdrs + i; - secname = pmod->klp_info->secstrings + sec->sh_name; - if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) - continue; - - /* - * Format: .klp.rela.sec_objname.section_name - * See comment in klp_resolve_symbols() for an explanation - * of the selected field width value. - */ - cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); - if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name\n", - secname); - ret = -EINVAL; - break; - } - - if (strcmp(objname, sec_objname)) - continue; - - ret = klp_resolve_symbols(sec, pmod); - if (ret) - break; + if (strcmp(objname ? objname : "vmlinux", sec_objname)) + return 0; - ret = apply_relocate_add(pmod->klp_info->sechdrs, - pmod->core_kallsyms.strtab, - pmod->klp_info->symndx, i, pmod); - if (ret) - break; - } + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec); + if (ret) + return ret; - return ret; + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); } /* @@ -730,6 +735,28 @@ void __weak arch_klp_init_object_loaded(struct klp_patch *patch, { } +int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) +{ + int i, ret; + struct klp_modinfo *info = patch->mod->klp_info; + + for (i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *sec = info->sechdrs + i; + + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; + + ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + info->secstrings, + patch->mod->core_kallsyms.strtab, + info->symndx, i, obj->name); + if (ret) + return ret; + } + + return 0; +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -738,18 +765,26 @@ static int klp_init_object_loaded(struct klp_patch *patch, int ret; mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; + + if (klp_is_module(obj)) { + /* + * Only write module-specific relocations here + * (.klp.rela.{module}.*). vmlinux-specific relocations were + * written earlier during the initialization of the klp module + * itself. + */ + ret = klp_apply_object_relocs(patch, obj); + if (ret) { + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + return ret; + } } arch_klp_init_object_loaded(patch, obj); - module_enable_ro(patch->mod, true); + module_enable_ro(patch->mod, true); mutex_unlock(&text_mutex); klp_for_each_func(obj, func) { diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..fdd9f6970e9a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2334,11 +2334,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; - /* Livepatch relocation sections are applied by livepatch */ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); else if (info->sechdrs[i].sh_type == SHT_RELA) -- cgit v1.2.3 From 1d05334d2899bd3ecdf01beb53f0a70884a7f471 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 10:24:45 -0500 Subject: livepatch: Remove .klp.arch After the previous patch, vmlinux-specific KLP relocations are now applied early during KLP module load. This means that .klp.arch sections are no longer needed for *vmlinux-specific* KLP relocations. One might think they're still needed for *module-specific* KLP relocations. If a to-be-patched module is loaded *after* its corresponding KLP module is loaded, any corresponding KLP relocations will be delayed until the to-be-patched module is loaded. If any special sections (.parainstructions, for example) rely on those relocations, their initializations (apply_paravirt) need to be done afterwards. Thus the apparent need for arch_klp_init_object_loaded() and its corresponding .klp.arch sections -- it allows some of the special section initializations to be done at a later time. But... if you look closer, that dependency between the special sections and the module-specific KLP relocations doesn't actually exist in reality. Looking at the contents of the .altinstructions and .parainstructions sections, there's not a realistic scenario in which a KLP module's .altinstructions or .parainstructions section needs to access a symbol in a to-be-patched module. It might need to access a local symbol or even a vmlinux symbol; but not another module's symbol. When a special section needs to reference a local or vmlinux symbol, a normal rela can be used instead of a KLP rela. Since the special section initializations don't actually have any real dependency on module-specific KLP relocations, .klp.arch and arch_klp_init_object_loaded() no longer have a reason to exist. So remove them. As Peter said much more succinctly: So the reason for .klp.arch was that .klp.rela.* stuff would overwrite paravirt instructions. If that happens you're doing it wrong. Those RELAs are core kernel, not module, and thus should've happened in .rela.* sections at patch-module loading time. Reverting this removes the two apply_{paravirt,alternatives}() calls from the late patching path, and means we don't have to worry about them when removing module_disable_ro(). [ jpoimboe: Rewrote patch description. Tweaked klp_init_object_loaded() error path. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c02791e5c75b..16632e75112a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -729,12 +729,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } -/* Arches may override this to finish any remaining arch-specific tasks */ -void __weak arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ -} - int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) { int i, ret; @@ -764,10 +758,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - if (klp_is_module(obj)) { + + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); + /* * Only write module-specific relocations here * (.klp.rela.{module}.*). vmlinux-specific relocations were @@ -775,17 +770,13 @@ static int klp_init_object_loaded(struct klp_patch *patch, * itself. */ ret = klp_apply_object_relocs(patch, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; - } - } - arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); + if (ret) + return ret; + } klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, -- cgit v1.2.3 From ca376a9374867d09ece6f61803764fb187201294 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:46 -0500 Subject: livepatch: Prevent module-specific KLP rela sections from referencing vmlinux symbols Prevent module-specific KLP rela sections from referencing vmlinux symbols. This helps prevent ordering issues with module special section initializations. Presumably such symbols are exported and normal relas can be used instead. Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 16632e75112a..f9ebb54affab 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -192,17 +192,20 @@ static int klp_find_object_symbol(const char *objname, const char *name, } static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, - unsigned int symndx, Elf_Shdr *relasec) + unsigned int symndx, Elf_Shdr *relasec, + const char *sec_objname) { - int i, cnt, vmlinux, ret; - char objname[MODULE_NAME_LEN]; - char symname[KSYM_NAME_LEN]; + int i, cnt, ret; + char sym_objname[MODULE_NAME_LEN]; + char sym_name[KSYM_NAME_LEN]; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; + bool sym_vmlinux; + bool sec_vmlinux = !strcmp(sec_objname, "vmlinux"); /* - * Since the field widths for objname and symname in the sscanf() + * Since the field widths for sym_objname and sym_name in the sscanf() * call are hard-coded and correspond to MODULE_NAME_LEN and * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN * and KSYM_NAME_LEN have the values we expect them to have. @@ -223,20 +226,33 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, return -EINVAL; } - /* Format: .klp.sym.objname.symname,sympos */ + /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, ".klp.sym.%55[^.].%127[^,],%lu", - objname, symname, &sympos); + sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } + sym_vmlinux = !strcmp(sym_objname, "vmlinux"); + + /* + * Prevent module-specific KLP rela sections from referencing + * vmlinux symbols. This helps prevent ordering issues with + * module special section initializations. Presumably such + * symbols are exported and normal relas can be used instead. + */ + if (!sec_vmlinux && sym_vmlinux) { + pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section", + sym_name); + return -EINVAL; + } + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ - vmlinux = !strcmp(objname, "vmlinux"); - ret = klp_find_object_symbol(vmlinux ? NULL : objname, - symname, sympos, &addr); + ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname, + sym_name, sympos, &addr); if (ret) return ret; @@ -294,7 +310,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, if (strcmp(objname ? objname : "vmlinux", sec_objname)) return 0; - ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec); + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); if (ret) return ret; -- cgit v1.2.3 From d556e1be33320366272ec02f93f98d7f308479f1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:50 -0500 Subject: livepatch: Remove module_disable_ro() usage With arch_klp_init_object_loaded() gone, and apply_relocate_add() now using text_poke(), livepatch no longer needs to use module_disable_ro(). Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f9ebb54affab..6b8b3c067be0 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -777,7 +777,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, if (klp_is_module(obj)) { mutex_lock(&text_mutex); - module_disable_ro(patch->mod); /* * Only write module-specific relocations here @@ -787,7 +786,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, */ ret = klp_apply_object_relocs(patch, obj); - module_enable_ro(patch->mod, true); mutex_unlock(&text_mutex); if (ret) -- cgit v1.2.3 From 0d9fbf78fefb421a3af97394ce80bba0db4f046a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:51 -0500 Subject: module: Remove module_disable_ro() module_disable_ro() has no more users. Remove it. Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index fdd9f6970e9a..3ba024afe379 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,19 +1997,6 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -/* livepatching wants to disable read-only so it can frob module. */ -void module_disable_ro(const struct module *mod) -{ - if (!rodata_enabled) - return; - - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); -} - void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) -- cgit v1.2.3 From 5b384f933590a086ca9a0abdc2e55e41107ac440 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:52 -0500 Subject: x86/module: Use text_mutex in apply_relocate_add() Now that the livepatch code no longer needs the text_mutex for changing module permissions, move its usage down to apply_relocate_add(). Note the s390 version of apply_relocate_add() doesn't need to use the text_mutex because it already uses s390_kernel_write_lock, which accomplishes the same task. Signed-off-by: Josh Poimboeuf Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6b8b3c067be0..96d2da14eb0d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -775,9 +775,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, int ret; if (klp_is_module(obj)) { - - mutex_lock(&text_mutex); - /* * Only write module-specific relocations here * (.klp.rela.{module}.*). vmlinux-specific relocations were @@ -785,9 +782,6 @@ static int klp_init_object_loaded(struct klp_patch *patch, * itself. */ ret = klp_apply_object_relocs(patch, obj); - - mutex_unlock(&text_mutex); - if (ret) return ret; } -- cgit v1.2.3 From e6eff4376e2897c2e14b70d87bf7284cdb093830 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:53 -0500 Subject: module: Make module_enable_ro() static again Now that module_enable_ro() has no more external users, make it static again. Suggested-by: Jessica Yu Signed-off-by: Josh Poimboeuf Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3ba024afe379..a26343ea4d50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,7 +1997,7 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -void module_enable_ro(const struct module *mod, bool after_init) +static void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) return; @@ -2025,6 +2025,7 @@ static void module_enable_nx(const struct module *mod) #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } +static void module_enable_ro(const struct module *mod, bool after_init) {} #endif /* CONFIG_STRICT_MODULE_RWX */ static void module_enable_x(const struct module *mod) { -- cgit v1.2.3 From 324cfb19567c80ed71d7a02f1d5ff4621902f4c3 Mon Sep 17 00:00:00 2001 From: Maciej Grochowski Date: Thu, 7 May 2020 18:35:49 -0700 Subject: kernel/kcov.c: fix typos in kcov_remote_start documentation Signed-off-by: Maciej Grochowski Signed-off-by: Andrew Morton Reviewed-by: Andrey Konovalov Link: http://lkml.kernel.org/r/20200420030259.31674-1-maciek.grochowski@gmail.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index f50354202dbe..8accc9722a81 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -740,8 +740,8 @@ static const struct file_operations kcov_fops = { * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an * arbitrary 4-byte non-zero number as the instance id). This common handle * then gets saved into the task_struct of the process that issued the - * KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn - * kernel threads, the common handle must be retrived via kcov_common_handle() + * KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn + * kernel threads, the common handle must be retrieved via kcov_common_handle() * and passed to the spawned threads via custom annotations. Those kernel * threads must in turn be annotated with kcov_remote_start(common_handle) and * kcov_remote_stop(). All of the threads that are spawned by the same process -- cgit v1.2.3 From 6b0b0fa2bce61db790efc8070ae6e5696435b0a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 2 May 2020 11:24:25 -0700 Subject: crypto: lib/sha1 - rename "sha" to "sha1" The library implementation of the SHA-1 compression function is confusingly called just "sha_transform()". Alongside it are some "SHA_" constants and "sha_init()". Presumably these are left over from a time when SHA just meant SHA-1. But now there are also SHA-2 and SHA-3, and moreover SHA-1 is now considered insecure and thus shouldn't be used. Therefore, rename these functions and constants to make it very clear that they are for SHA-1. Also add a comment to make it clear that these shouldn't be used. For the extra-misleadingly named "SHA_MESSAGE_BYTES", rename it to SHA1_BLOCK_SIZE and define it to just '64' rather than '(512/8)' so that it matches the same definition in . This prepares for merging into . Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/bpf/core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..14aa1f74dd10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); -- cgit v1.2.3 From 3f2c788a13143620c5471ac96ac4f033fc9ac3f3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 7 May 2020 12:32:14 +0200 Subject: fork: prevent accidental access to clone3 features Jan reported an issue where an interaction between sign-extending clone's flag argument on ppc64le and the new CLONE_INTO_CGROUP feature causes clone() to consistently fail with EBADF. The whole story is a little longer. The legacy clone() syscall is odd in a bunch of ways and here two things interact. First, legacy clone's flag argument is word-size dependent, i.e. it's an unsigned long whereas most system calls with flag arguments use int or unsigned int. Second, legacy clone() ignores unknown and deprecated flags. The two of them taken together means that users on 64bit systems can pass garbage for the upper 32bit of the clone() syscall since forever and things would just work fine. Just try this on a 64bit kernel prior to v5.7-rc1 where this will succeed and on v5.7-rc1 where this will fail with EBADF: int main(int argc, char *argv[]) { pid_t pid; /* Note that legacy clone() has different argument ordering on * different architectures so this won't work everywhere. * * Only set the upper 32 bits. */ pid = syscall(__NR_clone, 0xffffffff00000000 | SIGCHLD, NULL, NULL, NULL, NULL); if (pid < 0) exit(EXIT_FAILURE); if (pid == 0) exit(EXIT_SUCCESS); if (wait(NULL) != pid) exit(EXIT_FAILURE); exit(EXIT_SUCCESS); } Since legacy clone() couldn't be extended this was not a problem so far and nobody really noticed or cared since nothing in the kernel ever bothered to look at the upper 32 bits. But once we introduced clone3() and expanded the flag argument in struct clone_args to 64 bit we opened this can of worms. With the first flag-based extension to clone3() making use of the upper 32 bits of the flag argument we've effectively made it possible for the legacy clone() syscall to reach clone3() only flags. The sign extension scenario is just the odd corner-case that we needed to figure this out. The reason we just realized this now and not already when we introduced CLONE_CLEAR_SIGHAND was that CLONE_INTO_CGROUP assumes that a valid cgroup file descriptor has been given. So the sign extension (or the user accidently passing garbage for the upper 32 bits) caused the CLONE_INTO_CGROUP bit to be raised and the kernel to error out when it didn't find a valid cgroup file descriptor. Let's fix this by always capping the upper 32 bits for all codepaths that are not aware of clone3() features. This ensures that we can't reach clone3() only features by accident via legacy clone as with the sign extension case and also that legacy clone() works exactly like before, i.e. ignoring any unknown flags. This solution risks no regressions and is also pretty clean. Fixes: 7f192e3cd316 ("fork: add clone3") Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") Reported-by: Jan Stancek Signed-off-by: Christian Brauner Cc: Arnd Bergmann Cc: Dmitry V. Levin Cc: Andreas Schwab Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: stable@vger.kernel.org # 5.3+ Link: https://sourceware.org/pipermail/libc-alpha/2020-May/113596.html Link: https://lore.kernel.org/r/20200507103214.77218-1-christian.brauner@ubuntu.com --- kernel/fork.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..48ed22774efa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags, int __user *child_tidptr) { struct kernel_clone_args args = { - .flags = (clone_flags & ~CSIGNAL), + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, - .exit_signal = (clone_flags & CSIGNAL), + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = stack_start, .stack_size = stack_size, }; @@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags, pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (flags & CSIGNAL), + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), .stack = (unsigned long)fn, .stack_size = (unsigned long)arg, }; @@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #endif { struct kernel_clone_args args = { - .flags = (clone_flags & ~CSIGNAL), + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, - .exit_signal = (clone_flags & CSIGNAL), + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; -- cgit v1.2.3 From 52782c92ac85c4e393eb4a903a62e6c24afa633f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 6 May 2020 12:09:34 -0400 Subject: kthread: save thread function It's handy to keep the kthread_fn just as a unique cookie to identify classes of kthreads. E.g. if you can verify that a given task is running your thread_fn, then you may know what sort of type kthread_data points to. We'll use this in nfsd to pass some information into the vfs. Note it will need kthread_data() exported too. Original-patch-by: Tejun Heo Signed-off-by: J. Bruce Fields --- kernel/kthread.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..b84fc7eec035 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,6 +46,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; @@ -152,6 +153,20 @@ bool kthread_freezable_should_stop(bool *was_frozen) } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); +/** + * kthread_func - return the function specified on kthread creation + * @task: kthread task in question + * + * Returns NULL if the task is not a kthread. + */ +void *kthread_func(struct task_struct *task) +{ + if (task->flags & PF_KTHREAD) + return to_kthread(task)->threadfn; + return NULL; +} +EXPORT_SYMBOL_GPL(kthread_func); + /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question @@ -164,6 +179,7 @@ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } +EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() @@ -244,6 +260,7 @@ static int kthread(void *_create) do_exit(-ENOMEM); } + self->threadfn = threadfn; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -- cgit v1.2.3 From db803036ada7d61d096783726f9771b3fc540370 Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 8 May 2020 00:14:22 +0200 Subject: umh: fix memory leak on execve failure If a UMH process created by fork_usermode_blob() fails to execute, a pair of struct file allocated by umh_pipe_setup() will leak. Under normal conditions, the caller (like bpfilter) needs to manage the lifetime of the UMH and its two pipes. But when fork_usermode_blob() fails, the caller doesn't really have a way to know what needs to be done. It seems better to do the cleanup ourselves in this case. Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Signed-off-by: Vincent Minet Signed-off-by: Jakub Kicinski --- kernel/umh.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..20d51e0957e0 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -475,6 +475,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info) { struct umh_info *umh_info = info->data; + /* cleanup if umh_pipe_setup() was successful but exec failed */ + if (info->pid && info->retval) { + fput(umh_info->pipe_to_umh); + fput(umh_info->pipe_from_umh); + } + argv_free(info->argv); umh_info->pid = info->pid; } -- cgit v1.2.3 From f2a8d52e0a4db968c346c4332630a71cba377567 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:30 +0200 Subject: nsproxy: add struct nsset Add a simple struct nsset. It holds all necessary pieces to switch to a new set of namespaces without leaving a task in a half-switched state which we will make use of in the next patch. This patch switches the existing setns logic over without causing a change in setns() behavior. This brings setns() closer to how unshare() works(). The prepare_ns() function is responsible to prepare all necessary information. This has two reasons. First it minimizes dependencies between individual namespaces, i.e. all install handler can expect that all fields are properly initialized independent in what order they are called in. Second, this makes the code easier to maintain and easier to follow if it needs to be changed. The prepare_ns() helper will only be switched over to use a flags argument in the next patch. Here it will still use nstype as a simple integer argument which was argued would be clearer. I'm not particularly opinionated about this if it really helps or not. The struct nsset itself already contains the flags field since its name already indicates that it can contain information required by different namespaces. None of this should have functional consequences. Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-2-christian.brauner@ubuntu.com --- kernel/cgroup/namespace.c | 5 +-- kernel/nsproxy.c | 90 ++++++++++++++++++++++++++++++++++++++++------- kernel/pid_namespace.c | 5 +-- kernel/time/namespace.c | 5 +-- kernel/user_namespace.c | 8 ++--- kernel/utsname.c | 5 +-- 6 files changed, 93 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..b7954fd60475 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -257,12 +258,79 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(int nstype, struct nsset *nsset) +{ + struct task_struct *me = current; + + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); + + if (nstype == CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) + goto out; + + if (nstype == CLONE_NEWNS) + nsset->fs = me->fs; + + nsset->flags = nstype; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} + +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. + */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; struct file *file; struct ns_common *ns; + struct nsset nsset = {}; int err; file = proc_ns_fget(fd); @@ -274,20 +342,16 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ns->ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + err = prepare_nsset(ns->ops->type, &nsset); + if (err) goto out; - } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + err = ns->ops->install(&nsset, ns); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); } - switch_task_namespaces(tsk, new_nsproxy); - - perf_event_namespaces(tsk); + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..11db2bdbb41e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..5d9fc22d836a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +static int timens_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); int err; @@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; timens_set_vvar_page(current, ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if (!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); -- cgit v1.2.3 From 78a5255ffb6a1af189a83e493d916ba1c54d8c75 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 13:57:10 -0700 Subject: Stop the ad-hoc games with -Wno-maybe-initialized We have some rather random rules about when we accept the "maybe-initialized" warnings, and when we don't. For example, we consider it unreliable for gcc versions < 4.9, but also if -O3 is enabled, or if optimizing for size. And then various kernel config options disabled it, because they know that they trigger that warning by confusing gcc sufficiently (ie PROFILE_ALL_BRANCHES). And now gcc-10 seems to be introducing a lot of those warnings too, so it falls under the same heading as 4.9 did. At the same time, we have a very straightforward way to _enable_ that warning when wanted: use "W=2" to enable more warnings. So stop playing these ad-hoc games, and just disable that warning by default, with the known and straight-forward "if you want to work on the extra compiler warnings, use W=123". Would it be great to have code that is always so obvious that it never confuses the compiler whether a variable is used initialized or not? Yes, it would. In a perfect world, the compilers would be smarter, and our source code would be simpler. That's currently not the world we live in, though. Signed-off-by: Linus Torvalds --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 402eef84c859..743647005f64 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -466,7 +466,6 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. -- cgit v1.2.3 From ae24345da54e452880808b011fa2d8a0bbd191ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:58:59 -0700 Subject: bpf: Implement an interface to register bpf_iter targets The target can call bpf_iter_reg_target() to register itself. The needed information: target: target name seq_ops: the seq_file operations for the target init_seq_private target callback to initialize seq_priv during file open fini_seq_private target callback to clean up seq_priv during file release seq_priv_size: the private_data size needed by the seq_file operations The target name represents a target which provides a seq_ops for iterating objects. The target can provide two callback functions, init_seq_private and fini_seq_private, called during file open/release time. For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net name space needs to be setup properly during file open and released properly during file release. Function bpf_iter_unreg_target() is also implemented to unregister a particular target. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175859.2474669-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..6a8b0febd3f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..5a8119d17d14 --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include + +struct bpf_iter_target_info { + struct list_head list; + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->target = reg_info->target; + tinfo->seq_ops = reg_info->seq_ops; + tinfo->init_seq_private = reg_info->init_seq_private; + tinfo->fini_seq_private = reg_info->fini_seq_private; + tinfo->seq_priv_size = reg_info->seq_priv_size; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const char *target) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (!strcmp(target, tinfo->target)) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or teminating the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefit to use of bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and free based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so there is no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++++ 2 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 2057c92bc927f09b22f5609425eb37d7e782f484 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:02 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_UPDATE Added BPF_LINK_UPDATE support for tracing/iter programs. This way, a file based bpf iterator, which holds a reference to the link, can have its bpf program updated without creating new files. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175902.2475262-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 03f5832909db..0542a243b78c 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -23,6 +23,9 @@ struct bpf_iter_link { static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); +/* protect bpf_iter_link changes */ +static DEFINE_MUTEX(link_mutex); + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -111,9 +114,37 @@ static void bpf_iter_link_dealloc(struct bpf_link *link) kfree(iter_link); } +static int bpf_iter_link_replace(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + int ret = 0; + + mutex_lock(&link_mutex); + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + + if (link->prog->type != new_prog->type || + link->prog->expected_attach_type != new_prog->expected_attach_type || + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { + ret = -EINVAL; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&link_mutex); + return ret; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, + .update_prog = bpf_iter_link_replace, }; int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) -- cgit v1.2.3 From fd4f12bc38c3ad9107169e7c9e6e7f81d93dda97 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:04 -0700 Subject: bpf: Implement bpf_seq_read() for bpf iterator bpf iterator uses seq_file to provide a lossless way to transfer data to user space. But we want to call bpf program after all objects have been traversed, and bpf program may write additional data to the seq_file buffer. The current seq_read() does not work for this use case. Besides allowing stop() function to write to the buffer, the bpf_seq_read() also fixed the buffer size to one page. If any single call of show() or stop() will emit data more than one page to cause overflow, -E2BIG error code will be returned to user space. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175904.2475468-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0542a243b78c..832973ee80fa 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -26,6 +26,129 @@ static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* bpf_seq_read, a customized and simpler version for bpf iterator. + * no_llseek is assumed for this file. + * The following are differences from seq_read(): + * . fixed buffer size (PAGE_SIZE) + * . assuming no_llseek + * . stop() may call bpf program, handling potential overflow there + */ +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + size_t n, offs, copied = 0; + int err = 0; + void *p; + + mutex_lock(&seq->lock); + + if (!seq->buf) { + seq->size = PAGE_SIZE; + seq->buf = kmalloc(seq->size, GFP_KERNEL); + if (!seq->buf) { + err = -ENOMEM; + goto done; + } + } + + if (seq->count) { + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf + seq->from, n); + if (err) { + err = -EFAULT; + goto done; + } + seq->count -= n; + seq->from += n; + copied = n; + goto done; + } + + seq->from = 0; + p = seq->op->start(seq, &seq->index); + if (!p) + goto stop; + if (IS_ERR(p)) { + err = PTR_ERR(p); + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = 0; + } else if (err < 0 || seq_has_overflowed(seq)) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + while (1) { + loff_t pos = seq->index; + + offs = seq->count; + p = seq->op->next(seq, p, &seq->index); + if (pos == seq->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + seq->op->next); + seq->index++; + } + + if (IS_ERR_OR_NULL(p)) + break; + + if (seq->count >= size) + break; + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = offs; + } else if (err < 0 || seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + goto done; + } + break; + } + } +stop: + offs = seq->count; + /* bpf program called if !p */ + seq->op->stop(seq, p); + if (!p && seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } + } + + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf, n); + if (err) { + err = -EFAULT; + goto done; + } + copied = n; + seq->count -= n; + seq->from = n; +done: + if (!copied) + copied = err; + else + *ppos += copied; + mutex_unlock(&seq->lock); + return copied; +} + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: an unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registeration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 ++++++++++ 2 files changed, 155 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 367ec3e4834cbd611401c2c40a23c22c825474f1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:06 -0700 Subject: bpf: Create file bpf iterator To produce a file bpf iterator, the fd must be corresponding to a link_fd assocciated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175906.2475893-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 17 ++++++++++++++++- kernel/bpf/inode.c | 5 ++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index e7129b57865f..090f09b0eacb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -39,6 +39,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -162,6 +164,13 @@ done: return copied; } +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; @@ -183,7 +192,8 @@ static int iter_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations bpf_iter_fops = { +const struct file_operations bpf_iter_fops = { + .open = iter_open, .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, @@ -310,6 +320,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = { .update_prog = bpf_iter_link_replace, }; +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * -- cgit v1.2.3 From e5158d987b72c3f318b4b52a01ac6f3997bd0c00 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:07 -0700 Subject: bpf: Implement common macros/helpers for target iterators Macro DEFINE_BPF_ITER_FUNC is implemented so target can define an init function to capture the BTF type which represents the target. The bpf_iter_meta is a structure holding meta data, common to all targets in the bpf program. Additional marker functions are called before or after bpf_seq_read() show()/next()/stop() callback functions to help calculate precise seq_num and whether call bpf_prog inside stop(). Two functions, bpf_iter_get_info() and bpf_iter_run_prog(), are implemented so target can get needed information from bpf_iter infrastructure and can run the program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175907.2475956-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 090f09b0eacb..30efd15cd4a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -41,6 +41,33 @@ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -93,6 +120,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, err = seq->op->show(seq, p); if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. + */ + bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) @@ -117,11 +148,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (IS_ERR_OR_NULL(p)) break; + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + if (seq->count >= size) break; err = seq->op->show(seq, p); if (err > 0) { + bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; @@ -138,11 +173,15 @@ stop: offs = seq->count; /* bpf program called if !p */ seq->op->stop(seq, p); - if (!p && seq_has_overflowed(seq)) { - seq->count = offs; - if (offs == 0) { - err = -E2BIG; - goto done; + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } } } @@ -453,3 +492,39 @@ free_fd: put_unused_fd(fd); return err; } + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + return ret == 0 ? 0 : -EAGAIN; +} -- cgit v1.2.3 From 6086d29def80edd78f9832ea6eafa74e3818f6a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:09 -0700 Subject: bpf: Add bpf_map iterator Implement seq_file operations to traverse all bpf_maps. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175909.2476096-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/map_iter.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 ++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/map_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a8b0febd3f6..b2b5eefc5254 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..8162e0c00b9f --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +struct bpf_iter_seq_map_info { + u32 mid; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + ++*pos; + ++info->mid; + bpf_map_put((struct bpf_map *)v); + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + return map; +} + +struct bpf_iter__bpf_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +static int __init bpf_map_iter_init(void) +{ + struct bpf_iter_reg reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + }; + + return bpf_iter_reg_target(®_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a293e88ee01a..de2a75500233 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From eaaacd23910f2d7c4b22d43f591002cc217d294b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:11 -0700 Subject: bpf: Add task and task/file iterator targets Only the tasks belonging to "current" pid namespace are enumerated. For task/file target, the bpf program will have access to struct task_struct *task u32 fd struct file *file where fd/file is an open file for the task. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175911.2476407-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/task_iter.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/task_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index b2b5eefc5254..37b2d8620153 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c new file mode 100644 index 000000000000..aeed662d8451 --- /dev/null +++ b/kernel/bpf/task_iter.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include +#include +#include + +struct bpf_iter_seq_task_common { + struct pid_namespace *ns; +}; + +struct bpf_iter_seq_task_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + u32 tid; +}; + +static struct task_struct *task_seq_get_next(struct pid_namespace *ns, + u32 *tid) +{ + struct task_struct *task = NULL; + struct pid *pid; + + rcu_read_lock(); + pid = idr_get_next(&ns->idr, tid); + if (pid) + task = get_pid_task(pid, PIDTYPE_PID); + rcu_read_unlock(); + + return task; +} + +static void *task_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + ++*pos; + return task; +} + +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + ++*pos; + ++info->tid; + put_task_struct((struct task_struct *)v); + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + return task; +} + +struct bpf_iter__task { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) + +static int __task_seq_show(struct seq_file *seq, struct task_struct *task, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__task ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.task = task; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_seq_show(struct seq_file *seq, void *v) +{ + return __task_seq_show(seq, v, false); +} + +static void task_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__task_seq_show(seq, v, true); + else + put_task_struct((struct task_struct *)v); +} + +static const struct seq_operations task_seq_ops = { + .start = task_seq_start, + .next = task_seq_next, + .stop = task_seq_stop, + .show = task_seq_show, +}; + +struct bpf_iter_seq_task_file_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct files_struct *files; + u32 tid; + u32 fd; +}; + +static struct file * +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, + struct task_struct **task, struct files_struct **fstruct) +{ + struct pid_namespace *ns = info->common.ns; + u32 curr_tid = info->tid, max_fds; + struct files_struct *curr_files; + struct task_struct *curr_task; + int curr_fd = info->fd; + + /* If this function returns a non-NULL file object, + * it held a reference to the task/files_struct/file. + * Otherwise, it does not hold any reference. + */ +again: + if (*task) { + curr_task = *task; + curr_files = *fstruct; + curr_fd = info->fd; + } else { + curr_task = task_seq_get_next(ns, &curr_tid); + if (!curr_task) + return NULL; + + curr_files = get_files_struct(curr_task); + if (!curr_files) { + put_task_struct(curr_task); + curr_tid = ++(info->tid); + info->fd = 0; + goto again; + } + + /* set *fstruct, *task and info->tid */ + *fstruct = curr_files; + *task = curr_task; + if (curr_tid == info->tid) { + curr_fd = info->fd; + } else { + info->tid = curr_tid; + curr_fd = 0; + } + } + + rcu_read_lock(); + max_fds = files_fdtable(curr_files)->max_fds; + for (; curr_fd < max_fds; curr_fd++) { + struct file *f; + + f = fcheck_files(curr_files, curr_fd); + if (!f) + continue; + + /* set info->fd */ + info->fd = curr_fd; + get_file(f); + rcu_read_unlock(); + return f; + } + + /* the current task is done, go to the next task */ + rcu_read_unlock(); + put_files_struct(curr_files); + put_task_struct(curr_task); + *task = NULL; + *fstruct = NULL; + info->fd = 0; + curr_tid = ++(info->tid); + goto again; +} + +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = NULL; + struct task_struct *task = NULL; + struct file *file; + + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + ++*pos; + info->task = task; + info->files = files; + + return file; +} + +static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = info->files; + struct task_struct *task = info->task; + struct file *file; + + ++*pos; + ++info->fd; + fput((struct file *)v); + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + info->task = task; + info->files = files; + + return file; +} + +struct bpf_iter__task_file { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + u32 fd __aligned(8); + __bpf_md_ptr(struct file *, file); +}; + +DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, + struct task_struct *task, u32 fd, + struct file *file) + +static int __task_file_seq_show(struct seq_file *seq, struct file *file, + bool in_stop) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct bpf_iter__task_file ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.fd = info->fd; + ctx.file = file; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_file_seq_show(struct seq_file *seq, void *v) +{ + return __task_file_seq_show(seq, v, false); +} + +static void task_file_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + + if (!v) { + (void)__task_file_seq_show(seq, v, true); + } else { + fput((struct file *)v); + put_files_struct(info->files); + put_task_struct(info->task); + info->files = NULL; + info->task = NULL; + } +} + +static int init_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + common->ns = get_pid_ns(task_active_pid_ns(current)); + return 0; +} + +static void fini_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + put_pid_ns(common->ns); +} + +static const struct seq_operations task_file_seq_ops = { + .start = task_file_seq_start, + .next = task_file_seq_next, + .stop = task_file_seq_stop, + .show = task_file_seq_show, +}; + +static int __init task_iter_init(void) +{ + struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + }; + struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + }; + int ret; + + ret = bpf_iter_reg_target(&task_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&task_file_reg_info); +} +late_initcall(task_iter_init); -- cgit v1.2.3 From b121b341e5983bdccf7a5d6cf9236a45c965a31f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:12 -0700 Subject: bpf: Add PTR_TO_BTF_ID_OR_NULL support Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support. For tracing/iter program, the bpf program context definition, e.g., for previous bpf_map target, looks like struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; }; The kernel guarantees that meta is not NULL, but map pointer maybe NULL. The NULL map indicates that all objects have been traversed, so bpf program can take proper action, e.g., do final aggregation and/or send final report to user space. Add btf_id_or_null_non0_off to prog->aux structure, to indicate that if the context access offset is not 0, set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID. This bit is set for tracing/iter program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175912.2476576-1-yhs@fb.com --- kernel/bpf/btf.c | 5 ++++- kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2cfba89a8e1..c490fbde22d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; + if (off != 0 && prog->aux->btf_id_or_null_non0_off) + info->reg_type = PTR_TO_BTF_ID_OR_NULL; + else + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d725ff7d11db..36b2a38a06fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -483,6 +484,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", }; static char slot_type_char[] = { @@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID) + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return true; default: return false; @@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; + prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending how collection traversal is done, even if the object still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on no-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From 1d68f22b3d53d368d5cc8d09de890250cae5c945 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:15 -0700 Subject: bpf: Handle spilled PTR_TO_BTF_ID properly when checking stack_boundary This specifically to handle the case like below: // ptr below is a socket ptr identified by PTR_TO_BTF_ID u64 param[2] = { ptr, val }; bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param)); In this case, the 16 bytes stack for "param" contains: 8 bytes for ptr with spilled PTR_TO_BTF_ID 8 bytes for val as STACK_MISC The current verifier will complain the ptr should not be visible to the helper. ... 16: (7b) *(u64 *)(r10 -64) = r2 18: (7b) *(u64 *)(r10 -56) = r1 19: (bf) r4 = r10 ; 20: (07) r4 += -64 ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol); 21: (bf) r1 = r6 22: (18) r2 = 0xffffa8d00018605a 24: (b4) w3 = 10 25: (b4) w5 = 16 26: (85) call bpf_seq_printf#125 R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0) R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10 R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0) R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm fp-64_w=ptr_ last_idx 26 first_idx 13 regs=8 stack=0 before 25: (b4) w5 = 16 regs=8 stack=0 before 24: (b4) w3 = 10 invalid indirect read from stack off -64+0 size 16 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175915.2476783-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36b2a38a06fe..2a1826c76bb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3494,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + goto mark; + if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); -- cgit v1.2.3 From 9c5f8a1008a121e4c6b24af211034e24b0b63081 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:16 -0700 Subject: bpf: Support variable length array in tracing programs In /proc/net/ipv6_route, we have struct fib6_info { struct fib6_table *fib6_table; ... struct fib6_nh fib6_nh[0]; } struct fib6_nh { struct fib_nh_common nh_common; struct rt6_info **rt6i_pcpu; struct rt6_exception_bucket *rt6i_exception_bucket; }; struct fib_nh_common { ... u8 nhc_gw_family; ... } The access: struct fib6_nh *fib6_nh = &rt->fib6_nh; ... fib6_nh->nh_common.nhc_gw_family ... This patch ensures such an access is handled properly. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175916.2476853-1-yhs@fb.com --- kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c490fbde22d4..dcd233139294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3833,6 +3833,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; + u32 vlen; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3841,7 +3842,43 @@ again: return -EINVAL; } + vlen = btf_type_vlen(t); if (off + size > t->size) { + /* If the last element is a variable size array, we may + * need to relax the rule. + */ + struct btf_array *array_elem; + + if (vlen == 0) + goto error; + + member = btf_type_member(t) + vlen - 1; + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + NULL); + if (!btf_type_is_array(mtype)) + goto error; + + array_elem = (struct btf_array *)(mtype + 1); + if (array_elem->nelems != 0) + goto error; + + moff = btf_member_bit_offset(t, member) / 8; + if (off < moff) + goto error; + + /* Only allow structure for now, can be relaxed for + * other types later. + */ + elem_type = btf_type_skip_modifiers(btf_vmlinux, + array_elem->type, NULL); + if (!btf_type_is_struct(elem_type)) + goto error; + + off = (off - moff) % elem_type->size; + return btf_struct_access(log, elem_type, off, size, atype, + next_btf_id); + +error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; -- cgit v1.2.3 From b6522fa409cfafbc3968679b09e4028f0609f2b9 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Sat, 11 Apr 2020 21:06:19 +0800 Subject: parisc: add sysctl file interface panic_on_stackoverflow The variable sysctl_panic_on_stackoverflow is used in arch/parisc/kernel/irq.c and arch/x86/kernel/irq_32.c, but the sysctl file interface panic_on_stackoverflow only exists on x86. Add sysctl file interface panic_on_stackoverflow for parisc Signed-off-by: Xiaoming Ni Reviewed-by: Luis Chamberlain Signed-off-by: Helge Deller --- kernel/sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..b9ff323e1d26 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -994,30 +994,32 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) + +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#if defined(CONFIG_X86) { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3 From a4ae16f65c335f8be58b67b78628c788c4b325a5 Mon Sep 17 00:00:00 2001 From: Samuel Zou Date: Sat, 9 May 2020 09:16:41 +0800 Subject: livepatch: Make klp_apply_object_relocs static Fix the following sparse warning: kernel/livepatch/core.c:748:5: warning: symbol 'klp_apply_object_relocs' was not declared. The klp_apply_object_relocs() has only one call site within core.c; it should be static Fixes: 7c8e2bdd5f0d ("livepatch: Apply vmlinux-specific KLP relocations early") Reported-by: Hulk Robot Signed-off-by: Samuel Zou Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 96d2da14eb0d..f76fdb925532 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -745,7 +745,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } -int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) +static int klp_apply_object_relocs(struct klp_patch *patch, + struct klp_object *obj) { int i, ret; struct klp_modinfo *info = patch->mod->klp_info; -- cgit v1.2.3 From b92b36eadf4d7fa4a34f048c2a3bb61a735a885e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 May 2020 18:07:40 +0300 Subject: workqueue: Fix an use after free in init_rescuer() We need to preserve error code before freeing "rescuer". Fixes: f187b6974f6df ("workqueue: Use IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO.") Signed-off-by: Dan Carpenter Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddf0537dce14..10ed8d761e0b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4197,6 +4197,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -4208,8 +4209,9 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) { + ret = PTR_ERR(rescuer->task); kfree(rescuer); - return PTR_ERR(rescuer->task); + return ret; } wq->rescuer = rescuer; -- cgit v1.2.3 From 385bbf7b119a4feb6d6bcf3586f1bb1dd9c5b0a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:50:57 -0500 Subject: bpf, libbpf: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200507185057.GA13981@embeddedor --- kernel/bpf/queue_stack_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..30e1373fd437 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) -- cgit v1.2.3 From 8b1fac2e73e84ef0d6391051880a8e1d7044c847 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 10 May 2020 11:35:10 -0400 Subject: tracing: Wait for preempt irq delay thread to execute A bug report was posted that running the preempt irq delay module on a slow machine, and removing it quickly could lead to the thread created by the modlue to execute after the module is removed, and this could cause the kernel to crash. The fix for this was to call kthread_stop() after creating the thread to make sure it finishes before allowing the module to be removed. Now this caused the opposite problem on fast machines. What now happens is the kthread_stop() can cause the kthread never to execute and the test never to run. To fix this, add a completion and wait for the kthread to execute, then wait for it to end. This issue caused the ftracetest selftests to fail on the preemptirq tests. Link: https://lore.kernel.org/r/20200510114210.15d9e4af@oasis.local.home Cc: stable@vger.kernel.org Fixes: d16a8c31077e ("tracing: Wait for preempt irq delay thread to finish") Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/preemptirq_delay_test.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index c4c86de63cf9..312d1a0ca3b6 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -16,6 +16,7 @@ #include #include #include +#include static ulong delay = 100; static char test_mode[12] = "irq"; @@ -28,6 +29,8 @@ MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); +static struct completion done; + #define MIN(x, y) ((x) < (y) ? (x) : (y)) static void busy_wait(ulong time) @@ -114,6 +117,8 @@ static int preemptirq_delay_run(void *data) for (i = 0; i < s; i++) (testfuncs[i])(i); + complete(&done); + set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); @@ -128,15 +133,18 @@ static int preemptirq_delay_run(void *data) static int preemptirq_run_test(void) { struct task_struct *task; - char task_name[50]; + init_completion(&done); + snprintf(task_name, sizeof(task_name), "%s_test", test_mode); task = kthread_run(preemptirq_delay_run, NULL, task_name); if (IS_ERR(task)) return PTR_ERR(task); - if (task) + if (task) { + wait_for_completion(&done); kthread_stop(task); + } return 0; } -- cgit v1.2.3 From 90b5363acd4739769c3f38c1aff16171bd133e8c Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 27 Mar 2020 11:44:56 +0100 Subject: sched: Clean up scheduler_ipi() The scheduler IPI has grown weird and wonderful over the years, time for spring cleaning. Move all the non-trivial stuff out of it and into a regular smp function call IPI. This then reduces the schedule_ipi() to most of it's former NOP glory and ensures to keep the interrupt vector lean and mean. Aside of that avoiding the full irq_enter() in the x86 IPI implementation is incorrect as scheduler_ipi() can be instrumented. To work around that scheduler_ipi() had an irq_enter/exit() hack when heavy work was pending. This is gone now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134058.361859938@linutronix.de --- kernel/sched/core.c | 64 ++++++++++++++++++++++++---------------------------- kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 6 +++-- 3 files changed, 36 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b58efb1156eb..cd2070d6f1e4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -219,6 +219,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +321,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -650,6 +655,16 @@ static inline bool got_nohz_idle_kick(void) return false; } +static void nohz_csd_func(void *info) +{ + struct rq *rq = info; + + if (got_nohz_idle_kick()) { + rq->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } +} + #else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) @@ -2292,6 +2307,11 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } +static void wake_csd_func(void *info) +{ + sched_ttwu_pending(); +} + void scheduler_ipi(void) { /* @@ -2300,34 +2320,6 @@ void scheduler_ipi(void) * this IPI. */ preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) @@ -2336,9 +2328,9 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); + smp_call_function_single_async(cpu, &rq->wake_csd); else trace_sched_wake_idle_without_ipi(cpu); } @@ -6693,12 +6685,16 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + rq_csd_init(rq, &rq->wake_csd, wake_csd_func); + INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 46b7bd41573f..6b7f1474e2d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10000,12 +10000,11 @@ static void kick_ilb(unsigned int flags) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 978c6fac8cb8..21416b30c520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -889,9 +889,10 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ unsigned long nr_load_updates; @@ -978,7 +979,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1020,6 +1021,7 @@ struct rq { #endif #ifdef CONFIG_SMP + call_single_data_t wake_csd; struct llist_head wake_list; #endif -- cgit v1.2.3 From 2a0a24ebb499c9d499eea948d3fc108f936e36d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Mar 2020 12:42:00 +0100 Subject: sched: Make scheduler_ipi inline Now that the scheduler IPI is trivial and simple again there is no point to have the little function out of line. This simplifies the effort of constraining the instrumentation nicely. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134058.453581595@linutronix.de --- kernel/sched/core.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd2070d6f1e4..74fb89b5ce3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,16 +2312,6 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} - static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); -- cgit v1.2.3 From 4fdd88877e5259dcc5e504dca449acc0324d71b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:49:36 +0900 Subject: kprobes: Lock kprobe_mutex while showing kprobe_blacklist Lock kprobe_mutex while showing kprobe_blacklist to prevent updating the kprobe_blacklist. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.571125195@linutronix.de --- kernel/kprobes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..570d60827656 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2420,6 +2420,7 @@ static const struct file_operations debugfs_kprobes_operations = { /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2446,10 +2447,15 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) +{ + mutex_unlock(&kprobe_mutex); +} + static const struct seq_operations kprobe_blacklist_seq_ops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ + .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; -- cgit v1.2.3 From 1e6769b0aece51ea7a3dc3117c37d4a5669e4a21 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:49:48 +0900 Subject: kprobes: Support __kprobes blacklist in modules Support __kprobes attribute for blacklist functions in modules. The __kprobes attribute functions are stored in .kprobes.text section. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.678201813@linutronix.de --- kernel/kprobes.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/module.c | 4 ++++ 2 files changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 570d60827656..b7549992b9bd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,19 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2215,6 +2228,28 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2260,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2295,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..978f3fa40850 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->ei_funcs = section_objs(info, "_error_injection_whitelist", sizeof(*mod->ei_funcs), &mod->num_ei_funcs); +#endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 16db6264c93d2d7df9eb8be5d9eb717ab30105fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:50:00 +0900 Subject: kprobes: Support NOKPROBE_SYMBOL() in modules Support NOKPROBE_SYMBOL() in modules. NOKPROBE_SYMBOL() records only symbol address in "_kprobe_blacklist" section in the module. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.771170126@linutronix.de --- kernel/kprobes.c | 17 +++++++++++++++++ kernel/module.c | 3 +++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7549992b9bd..9eb5acf0a9f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2192,6 +2192,11 @@ static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) } } +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2231,6 +2236,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { @@ -2242,6 +2253,12 @@ static void add_module_kprobe_blacklist(struct module *mod) static void remove_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { diff --git a/kernel/module.c b/kernel/module.c index 978f3fa40850..faf733789560 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3197,6 +3197,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_KPROBES mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 59566b0b622e3e6ea928c0b8cac8a5601b00b383 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 30 Apr 2020 20:21:47 -0400 Subject: x86/ftrace: Have ftrace trampolines turn read-only at the end of system boot up Booting one of my machines, it triggered the following crash: Kernel/User page tables isolation: enabled ftrace: allocating 36577 entries in 143 pages Starting tracer 'function' BUG: unable to handle page fault for address: ffffffffa000005c #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 2014067 P4D 2014067 PUD 2015063 PMD 7b253067 PTE 7b252061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.0-test+ #24 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 RIP: 0010:text_poke_early+0x4a/0x58 Code: 34 24 48 89 54 24 08 e8 bf 72 0b 00 48 8b 34 24 48 8b 4c 24 08 84 c0 74 0b 48 89 df f3 a4 48 83 c4 10 5b c3 9c 58 fa 48 89 df a4 50 9d 48 83 c4 10 5b e9 d6 f9 ff ff 0 41 57 49 RSP: 0000:ffffffff82003d38 EFLAGS: 00010046 RAX: 0000000000000046 RBX: ffffffffa000005c RCX: 0000000000000005 RDX: 0000000000000005 RSI: ffffffff825b9a90 RDI: ffffffffa000005c RBP: ffffffffa000005c R08: 0000000000000000 R09: ffffffff8206e6e0 R10: ffff88807b01f4c0 R11: ffffffff8176c106 R12: ffffffff8206e6e0 R13: ffffffff824f2440 R14: 0000000000000000 R15: ffffffff8206eac0 FS: 0000000000000000(0000) GS:ffff88807d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa000005c CR3: 0000000002012000 CR4: 00000000000006b0 Call Trace: text_poke_bp+0x27/0x64 ? mutex_lock+0x36/0x5d arch_ftrace_update_trampoline+0x287/0x2d5 ? ftrace_replace_code+0x14b/0x160 ? ftrace_update_ftrace_func+0x65/0x6c __register_ftrace_function+0x6d/0x81 ftrace_startup+0x23/0xc1 register_ftrace_function+0x20/0x37 func_set_flag+0x59/0x77 __set_tracer_option.isra.19+0x20/0x3e trace_set_options+0xd6/0x13e apply_trace_boot_options+0x44/0x6d register_tracer+0x19e/0x1ac early_trace_init+0x21b/0x2c9 start_kernel+0x241/0x518 ? load_ucode_intel_bsp+0x21/0x52 secondary_startup_64+0xa4/0xb0 I was able to trigger it on other machines, when I added to the kernel command line of both "ftrace=function" and "trace_options=func_stack_trace". The cause is the "ftrace=function" would register the function tracer and create a trampoline, and it will set it as executable and read-only. Then the "trace_options=func_stack_trace" would then update the same trampoline to include the stack tracer version of the function tracer. But since the trampoline already exists, it updates it with text_poke_bp(). The problem is that text_poke_bp() called while system_state == SYSTEM_BOOTING, it will simply do a memcpy() and not the page mapping, as it would think that the text is still read-write. But in this case it is not, and we take a fault and crash. Instead, lets keep the ftrace trampolines read-write during boot up, and then when the kernel executable text is set to read-only, the ftrace trampolines get set to read-only as well. Link: https://lkml.kernel.org/r/20200430202147.4dc6e2de@oasis.local.home Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: "H. Peter Anvin" Cc: stable@vger.kernel.org Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()") Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace_internal.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0456e0a3dab1..382775edf690 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -4,28 +4,6 @@ #ifdef CONFIG_FUNCTION_TRACER -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_check() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_check() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -#define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_check(list); \ - do - -/* - * Optimized for just a single item in the list (as that is the normal case). - */ -#define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_check((op)->next)) && \ - unlikely((op) != &ftrace_list_end)) - -extern struct ftrace_ops __rcu *ftrace_ops_list; -extern struct ftrace_ops ftrace_list_end; extern struct mutex ftrace_lock; extern struct ftrace_ops global_ops; -- cgit v1.2.3 From 303cc571d107b3641d6487061b748e70ffe15ce4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:31 +0200 Subject: nsproxy: attach to namespaces via pidfds For quite a while we have been thinking about using pidfds to attach to namespaces. This patchset has existed for about a year already but we've wanted to wait to see how the general api would be received and adopted. Now that more and more programs in userspace have started using pidfds for process management it's time to send this one out. This patch makes it possible to use pidfds to attach to the namespaces of another process, i.e. they can be passed as the first argument to the setns() syscall. When only a single namespace type is specified the semantics are equivalent to passing an nsfd. That means setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However, when a pidfd is passed, multiple namespace flags can be specified in the second setns() argument and setns() will attach the caller to all the specified namespaces all at once or to none of them. Specifying 0 is not valid together with a pidfd. Here are just two obvious examples: setns(pidfd, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET); setns(pidfd, CLONE_NEWUSER); Allowing to also attach subsets of namespaces supports various use-cases where callers setns to a subset of namespaces to retain privilege, perform an action and then re-attach another subset of namespaces. If the need arises, as Eric suggested, we can extend this patchset to assume even more context than just attaching all namespaces. His suggestion specifically was about assuming the process' root directory when setns(pidfd, 0) or setns(pidfd, SETNS_PIDFD) is specified. For now, just keep it flexible in terms of supporting subsets of namespaces but let's wait until we have users asking for even more context to be assumed. At that point we can add an extension. The obvious example where this is useful is a standard container manager interacting with a running container: pushing and pulling files or directories, injecting mounts, attaching/execing any kind of process, managing network devices all these operations require attaching to all or at least multiple namespaces at the same time. Given that nowadays most containers are spawned with all namespaces enabled we're currently looking at at least 14 syscalls, 7 to open the /proc//ns/ nsfds, another 7 to actually perform the namespace switch. With time namespaces we're looking at about 16 syscalls. (We could amortize the first 7 or 8 syscalls for opening the nsfds by stashing them in each container's monitor process but that would mean we need to send around those file descriptors through unix sockets everytime we want to interact with the container or keep on-disk state. Even in scenarios where a caller wants to join a particular namespace in a particular order callers still profit from batching other namespaces. That mostly applies to the user namespace but all container runtimes I found join the user namespace first no matter if it privileges or deprivileges the container similar to how unshare behaves.) With pidfds this becomes a single syscall no matter how many namespaces are supposed to be attached to. A decently designed, large-scale container manager usually isn't the parent of any of the containers it spawns so the containers don't die when it crashes or needs to update or reinitialize. This means that for the manager to interact with containers through pids is inherently racy especially on systems where the maximum pid number is not significicantly bumped. This is even more problematic since we often spawn and manage thousands or ten-thousands of containers. Interacting with a container through a pid thus can become risky quite quickly. Especially since we allow for an administrator to enable advanced features such as syscall interception where we're performing syscalls in lieu of the container. In all of those cases we use pidfds if they are available and we pass them around as stable references. Using them to setns() to the target process' namespaces is as reliable as using nsfds. Either the target process is already dead and we get ESRCH or we manage to attach to its namespaces but we can't accidently attach to another process' namespaces. So pidfds lend themselves to be used with this api. The other main advantage is that with this change the pidfd becomes the only relevant token for most container interactions and it's the only token we need to create and send around. Apart from significiantly reducing the number of syscalls from double digit to single digit which is a decent reason post-spectre/meltdown this also allows to switch to a set of namespaces atomically, i.e. either attaching to all the specified namespaces succeeds or we fail. If we fail we haven't changed a single namespace. There are currently three namespaces that can fail (other than for ENOMEM which really is not very interesting since we then have other problems anyway) for non-trivial reasons, user, mount, and pid namespaces. We can fail to attach to a pid namespace if it is not our current active pid namespace or a descendant of it. We can fail to attach to a user namespace because we are multi-threaded or because our current mount namespace shares filesystem state with other tasks, or because we're trying to setns() to the same user namespace, i.e. the target task has the same user namespace as we do. We can fail to attach to a mount namespace because it shares filesystem state with other tasks or because we fail to lookup the new root for the new mount namespace. In most non-pathological scenarios these issues can be somewhat mitigated. But there are cases where we're half-attached to some namespace and failing to attach to another one. I've talked about some of these problem during the hallway track (something only the pre-COVID-19 generation will remember) of Plumbers in Los Angeles in 2018(?). Even if all these issues could be avoided with super careful userspace coding it would be nicer to have this done in-kernel. Pidfds seem to lend themselves nicely for this. The other neat thing about this is that setns() becomes an actual counterpart to the namespace bits of unshare(). Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-3-christian.brauner@ubuntu.com --- kernel/nsproxy.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 213 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b7954fd60475..b03df67621d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -258,17 +259,58 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static int check_setns_flags(unsigned long flags) +{ + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | + CLONE_NEWCGROUP))) + return -EINVAL; + +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif + + return 0; +} + static void put_nsset(struct nsset *nsset) { unsigned flags = nsset->flags; if (flags & CLONE_NEWUSER) put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. + */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); if (nsset->nsproxy) free_nsproxy(nsset->nsproxy); } -static int prepare_nsset(int nstype, struct nsset *nsset) +static int prepare_nsset(unsigned flags, struct nsset *nsset) { struct task_struct *me = current; @@ -276,17 +318,23 @@ static int prepare_nsset(int nstype, struct nsset *nsset) if (IS_ERR(nsset->nsproxy)) return PTR_ERR(nsset->nsproxy); - if (nstype == CLONE_NEWUSER) + if (flags & CLONE_NEWUSER) nsset->cred = prepare_creds(); else nsset->cred = current_cred(); if (!nsset->cred) goto out; - if (nstype == CLONE_NEWNS) + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; + } - nsset->flags = nstype; + nsset->flags = flags; return 0; out: @@ -294,6 +342,138 @@ out: return -ENOMEM; } +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. + */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + /* * This is the point of no return. There are just a few namespaces * that do some actual work here and it's sufficiently minimal that @@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset) } #endif + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); + } + #ifdef CONFIG_IPC_NS if (flags & CLONE_NEWIPC) exit_sem(me); @@ -326,27 +512,38 @@ static void commit_nsset(struct nsset *nsset) nsset->nsproxy = NULL; } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +SYSCALL_DEFINE2(setns, int, fd, int, flags) { struct file *file; - struct ns_common *ns; + struct ns_common *ns = NULL; struct nsset nsset = {}; - int err; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); + int err = 0; - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EBADF; + } + if (err) goto out; - err = prepare_nsset(ns->ops->type, &nsset); + err = prepare_nsset(flags, &nsset); if (err) goto out; - err = ns->ops->install(&nsset, ns); + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); -- cgit v1.2.3 From c9d64a1b2d0b9c81aef5e907bbeff549fd844d13 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:33 -0700 Subject: rcuwait: Fix stale wake call name in comment The 'trywake' name was renamed to simply 'wake', update the comment. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-2-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 389a88cb3081..9f9015f3f6b0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -236,7 +236,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE -- cgit v1.2.3 From 9d9a6ebfea329cadeda87544dceae01e9c27aaef Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:34 -0700 Subject: rcuwait: Let rcuwait_wake_up() return whether or not a task was awoken Propagating the return value of wake_up_process() back to the caller can come in handy for future users, such as for statistics or accounting purposes. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-3-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9f9015f3f6b0..f3beb637acf7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -227,8 +227,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -248,8 +249,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); -- cgit v1.2.3 From 2e3ed68bfcd9c5ca2cf8b88ba23a34992ccd0b1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:18 -0700 Subject: bpf: Add comments to interpret bpf_prog return values Add a short comment in bpf_iter_run_prog() function to explain how bpf_prog return value is converted to seq_ops->show() return value: bpf_prog return seq_ops()->show() return 0 0 1 -EAGAIN When show() return value is -EAGAIN, the current bpf_seq_read() will end. If the current seq_file buffer is empty, -EAGAIN will return to user space. Otherwise, the buffer will be copied to user space. In both cases, the next bpf_seq_read() call will try to show the same object which returned -EAGAIN previously. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180218.2949517-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 30efd15cd4a0..0a45a6cdfabd 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -526,5 +526,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock(); + /* bpf program can only return 0 or 1: + * 0 : okay + * 1 : retry the same object + * The bpf_iter_run_prog() return value + * will be seq_ops->show() return value. + */ return ret == 0 ? 0 : -EAGAIN; } -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From 3d2353de81061cab4b9d68b3e1dc69cbec1451ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 13 May 2020 15:18:01 -0400 Subject: ring-buffer: Don't deactivate the ring buffer on failed iterator reads If the function tracer is running and the trace file is read (which uses the ring buffer iterator), the iterator can get in sync with the writes, and caues it to fail to find a page with content it can read three times. This causes a warning and deactivation of the ring buffer code. Looking at the other cases of failure to get an event, it appears that there's a chance that the writer could cause them too. Since the iterator is a "best effort" to read the ring buffer if there's an active writer (the consumer reader is made for this case "see trace_pipe"), if it fails to get an event after three tries, simply give up and return NULL. Don't warn, nor disable the ring buffer on this failure. Link: https://lore.kernel.org/r/20200429090508.GG5770@shao2-debian Reported-by: kernel test robot Fixes: ff84c50cfb4b ("ring-buffer: Do not die if rb_iter_peek() fails more than thrice") Tested-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6f0b42ceeb00..448d5f528764 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4034,7 +4034,6 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; - bool failed = false; if (ts) *ts = 0; @@ -4056,19 +4055,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered or we hit - * the end of the page. Since the time extend is always attached - * to a data event, we should never loop more than three times. - * Once for going to next page, once on time extend, and - * finally once to get the event. - * We should never hit the following condition more than thrice, - * unless the buffer is very small, and there's a writer - * that is causing the reader to fail getting an event. + * As the writer can mess with what the iterator is trying + * to read, just give up if we fail to get an event after + * three tries. The iterator is not as reliable when reading + * the ring buffer with an active write as the consumer is. + * Do not warn if the three failures is reached. */ - if (++nr_loops > 3) { - RB_WARN_ON(cpu_buffer, !failed); + if (++nr_loops > 3) return NULL; - } if (rb_per_cpu_empty(cpu_buffer)) return NULL; @@ -4079,10 +4073,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } event = rb_iter_head_event(iter); - if (!event) { - failed = true; + if (!event) goto again; - } switch (event->type_len) { case RINGBUF_TYPE_PADDING: -- cgit v1.2.3 From da4d401a6b8fda7414033f81982f64ade02c0e27 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 13 May 2020 15:36:22 -0400 Subject: ring-buffer: Remove all BUG() calls There's a lot of checks to make sure the ring buffer is working, and if an anomaly is detected, it safely shuts itself down. But there's a few cases that it will call BUG(), which defeats the point of being safe (it crashes the kernel when an anomaly is found!). There's no reason for them. Switch them all to either WARN_ON_ONCE() (when no ring buffer descriptor is present), or to RB_WARN_ON() (when a ring buffer descriptor is present). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 448d5f528764..b8e1ca48be50 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -193,7 +193,7 @@ rb_event_length(struct ring_buffer_event *event) case RINGBUF_TYPE_DATA: return rb_event_data_length(event); default: - BUG(); + WARN_ON_ONCE(1); } /* not hit */ return 0; @@ -249,7 +249,7 @@ rb_event_data(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); - BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) return (void *)&event->array[0]; @@ -3727,7 +3727,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; default: - BUG(); + RB_WARN_ON(cpu_buffer, 1); } return; } @@ -3757,7 +3757,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; default: - BUG(); + RB_WARN_ON(iter->cpu_buffer, 1); } return; } @@ -4020,7 +4020,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, return event; default: - BUG(); + RB_WARN_ON(cpu_buffer, 1); } return NULL; @@ -4109,7 +4109,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return event; default: - BUG(); + RB_WARN_ON(cpu_buffer, 1); } return NULL; -- cgit v1.2.3 From 333291ce5055f2039afc907badaf5b66bc1adfdc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 16:59:25 -0700 Subject: bpf: Fix bug in mmap() implementation for BPF array map mmap() subsystem allows user-space application to memory-map region with initial page offset. This wasn't taken into account in initial implementation of BPF array memory-mapping. This would result in wrong pages, not taking into account requested page shift, being memory-mmaped into user-space. This patch fixes this gap and adds a test for such scenario. Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512235925.3817805-1-andriin@fb.com --- kernel/bpf/arraymap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d6120fd5ba6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -486,7 +486,12 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; - return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > + PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), + vma->vm_pgoff + pgoff); } const struct bpf_map_ops array_map_ops = { -- cgit v1.2.3 From e92888c72fbdc6f9d07b3b0604c012e81d7c0da7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 22:32:05 -0700 Subject: bpf: Enforce returning 0 for fentry/fexit progs Currently, tracing/fentry and tracing/fexit prog return values are not enforced. In trampoline codes, the fentry/fexit prog return values are ignored. Let us enforce it to be 0 to avoid confusion and allows potential future extension. This patch also explicitly added return value checking for tracing/raw_tp, tracing/fmod_ret, and freplace programs such that these program return values can be anything. The purpose are two folds: 1. to make it explicit about return value expectations for these programs in verifier. 2. for tracing prog_type, if a future attach type is added, the default is -ENOTSUPP which will enforce to specify return value ranges explicitly. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200514053206.1298415-1-yhs@fb.com --- kernel/bpf/verifier.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa1d8245b925..a44ba6672688 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7059,6 +7059,23 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + range = tnum_const(0); + break; + case BPF_TRACE_RAW_TP: + case BPF_MODIFY_RETURN: + return 0; + default: + return -ENOTSUPP; + } + break; + case BPF_PROG_TYPE_EXT: + /* freplace program can return anything as its return value + * depends on the to-be-replaced kernel func or bpf program. + */ default: return 0; } -- cgit v1.2.3 From c70f34a8ac66c2cb05593ef5760142e5f862a9b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 May 2020 22:51:37 -0700 Subject: bpf: Fix bpf_iter's task iterator logic task_seq_get_next might stop prematurely if get_pid_task() fails to get task_struct. Failure to do so doesn't mean that there are no more tasks with higher pids. Procfs's iteration algorithm (see next_tgid in fs/proc/base.c) does a retry in such case. After this fix, instead of stopping prematurely after about 300 tasks on my server, bpf_iter program now returns >4000, which sounds much closer to reality. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200514055137.1564581-1-andriin@fb.com --- kernel/bpf/task_iter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index a9b7264dda08..4dbf2b6035f8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -27,9 +27,15 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns, struct pid *pid; rcu_read_lock(); +retry: pid = idr_get_next(&ns->idr, tid); - if (pid) + if (pid) { task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ++*tid; + goto retry; + } + } rcu_read_unlock(); return task; -- cgit v1.2.3 From db612f749e2454c506f20155bba2871f0307d133 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:38 +0200 Subject: xdp: Cpumap redirect use frame_sz and increase skb_tailroom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knowing the memory size backing the packet/xdp_frame data area, and knowing it already have reserved room for skb_shared_info, simplifies using build_skb significantly. With this change we no-longer lie about the SKB truesize, but more importantly a significant larger skb_tailroom is now provided, e.g. when drivers uses a full PAGE_SIZE. This extra tailroom (in linear area) can be used by the network stack when coalescing SKBs (e.g. in skb_try_coalesce, see TCP cases where tcp_queue_rcv() can 'eat' skb). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337822.97035.13557959180460986059.stgit@firesoul --- kernel/bpf/cpumap.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..a71790dab12d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); -- cgit v1.2.3 From c69b470eb85798514723ffa2686da6d21198c0d0 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:49 +0100 Subject: kdb: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: Jason Wessel Cc: Daniel Thompson Cc: kgdb-bugreport@lists.sourceforge.net Acked-by: Daniel Thompson Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-9-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..355bea54ca0e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -920,7 +920,7 @@ static void sysrq_handle_dbg(int key) kgdb_breakpoint(); } -static struct sysrq_key_op sysrq_dbg_op = { +static const struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, .help_msg = "debug(g)", .action_msg = "DEBUG", -- cgit v1.2.3 From 6400b5a0f604298a03748b96693a77d12b479998 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:50 +0100 Subject: kernel/power: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: linux-pm@vger.kernel.org Acked-by: Rafael J. Wysocki Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-10-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/power/poweroff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 6d475281c730..562aa0e450ed 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -29,7 +29,7 @@ static void handle_poweroff(int key) schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } -static struct sysrq_key_op sysrq_poweroff_op = { +static const struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "poweroff(o)", .action_msg = "Power Off", -- cgit v1.2.3 From 0ca650c430404708a6e8bea75e5f92ce8448f098 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:51 +0100 Subject: rcu: constify sysrq_key_op With earlier commits, the API no longer discards the const-ness of the sysrq_key_op. As such we can add the notation. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: rcu@vger.kernel.org Reviewed-by: Paul E. McKenney Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-11-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4e6ed7b91f59 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -729,7 +729,7 @@ static void sysrq_show_rcu(int key) show_rcu_gp_kthreads(); } -static struct sysrq_key_op sysrq_rcudump_op = { +static const struct sysrq_key_op sysrq_rcudump_op = { .handler = sysrq_show_rcu, .help_msg = "show-rcu(y)", .action_msg = "Show RCU tree", -- cgit v1.2.3 From 0ebeea8ca8a4d1d453ad299aef0507dab04f6e8d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:16 +0200 Subject: bpf: Restrict bpf_probe_read{, str}() only to archs where they work Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs with overlapping address ranges, we should really take the next step to disable them from BPF use there. To generally fix the situation, we've recently added new helper variants bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str(). For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user,kernel}_str helpers"). Given bpf_probe_read{,str}() have been around for ~5 years by now, there are plenty of users at least on x86 still relying on them today, so we cannot remove them entirely w/o breaking the BPF tracing ecosystem. However, their use should be restricted to archs with non-overlapping address ranges where they are working in their current form. Therefore, move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and have x86, arm64, arm select it (other archs supporting it can follow-up on it as well). For the remaining archs, they can workaround easily by relying on the feature probe from bpftool which spills out defines that can be used out of BPF C code to implement the drop-in replacement for old/new kernels via: bpftool feature probe macro Suggested-by: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Acked-by: Linus Torvalds Cc: Brendan Gregg Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..b83bdaa31c7b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -825,14 +825,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read: - return &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + case BPF_FUNC_probe_read: + return &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_compat_str_proto; +#endif #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit v1.2.3 From 47cc0ed574abcbbde0cf143ddb21a0baed1aa2df Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:17 +0200 Subject: bpf: Add bpf_probe_read_{user, kernel}_str() to do_refine_retval_range Given bpf_probe_read{,str}() BPF helpers are now only available under CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, we need to add the drop-in replacements of bpf_probe_read_{kernel,user}_str() to do_refine_retval_range() as well to avoid hitting the same issue as in 849fa50662fbc ("bpf/verifier: refine retval R0 state for bpf_get_stack helper"). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200515101118.6508-3-daniel@iogearbox.net --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a44ba6672688..8d7ee40e2748 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4340,7 +4340,9 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && - func_id != BPF_FUNC_probe_read_str)) + func_id != BPF_FUNC_probe_read_str && + func_id != BPF_FUNC_probe_read_kernel_str && + func_id != BPF_FUNC_probe_read_user_str)) return; ret_reg->smax_value = meta->msize_max_value; -- cgit v1.2.3 From b2a5212fb634561bb734c6356904e37f6665b955 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:18 +0200 Subject: bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier Usage of plain %s conversion specifier in bpf_trace_printk() suffers from the very same issue as bpf_probe_read{,str}() helpers, that is, it is broken on archs with overlapping address ranges. While the helpers have been addressed through work in 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers"), we need an option for bpf_trace_printk() as well to fix it. Similarly as with the helpers, force users to make an explicit choice by adding %pks and %pus specifier to bpf_trace_printk() which will then pick the corresponding strncpy_from_unsafe*() variant to perform the access under KERNEL_DS or USER_DS. The %pk* (kernel specifier) and %pu* (user specifier) can later also be extended for other objects aside strings that are probed and printed under tracing, and reused out of other facilities like bpf_seq_printf() or BTF based type printing. Existing behavior of %s for current users is still kept working for archs where it is not broken and therefore gated through CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE. For archs not having this property we fall-back to pick probing under KERNEL_DS as a sensible default. Fixes: 8d3b7dce8622 ("bpf: add support for %s specifier to bpf_trace_printk()") Reported-by: Linus Torvalds Reported-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Brendan Gregg Link: https://lore.kernel.org/bpf/20200515101118.6508-4-daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 94 +++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b83bdaa31c7b..a010edc37ee0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -323,17 +323,15 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { + int i, mod[3] = {}, fmt_cnt = 0; + char buf[64], fmt_ptype; + void *unsafe_ptr = NULL; bool str_seen = false; - int mod[3] = {}; - int fmt_cnt = 0; - u64 unsafe_addr; - char buf[64]; - int i; /* * bpf_check()->check_func_arg()->check_stack_boundary() @@ -359,40 +357,71 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p' || fmt[i] == 's') { + } else if (fmt[i] == 'p') { mod[fmt_cnt]++; + if ((fmt[i + 1] == 'k' || + fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) return -EINVAL; - fmt_cnt++; - if (fmt[i] == 's') { - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - switch (fmt_cnt) { - case 1: - unsafe_addr = arg1; - arg1 = (long) buf; - break; - case 2: - unsafe_addr = arg2; - arg2 = (long) buf; - break; - case 3: - unsafe_addr = arg3; - arg3 = (long) buf; - break; - } - buf[0] = 0; - strncpy_from_unsafe(buf, - (void *) (long) unsafe_addr, + + goto fmt_next; + } else if (fmt[i] == 's') { + mod[fmt_cnt]++; + fmt_ptype = fmt[i]; +fmt_str: + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) + return -EINVAL; + + switch (fmt_cnt) { + case 0: + unsafe_ptr = (void *)(long)arg1; + arg1 = (long)buf; + break; + case 1: + unsafe_ptr = (void *)(long)arg2; + arg2 = (long)buf; + break; + case 2: + unsafe_ptr = (void *)(long)arg3; + arg3 = (long)buf; + break; + } + + buf[0] = 0; + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + strncpy_from_unsafe(buf, unsafe_ptr, sizeof(buf)); + break; +#endif + case 'k': + strncpy_from_unsafe_strict(buf, unsafe_ptr, + sizeof(buf)); + break; + case 'u': + strncpy_from_unsafe_user(buf, + (__force void __user *)unsafe_ptr, + sizeof(buf)); + break; } - continue; + goto fmt_next; } if (fmt[i] == 'l') { @@ -403,6 +432,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; +fmt_next: fmt_cnt++; } -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enable ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that the networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigation applied. Such networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ 14 files changed, 109 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); -- cgit v1.2.3 From d08b9f0ca6605e13dcb48f04e55a30545b3c71eb Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:07 -0700 Subject: scs: Add support for Clang's Shadow Call Stack (SCS) This change adds generic support for Clang's Shadow Call Stack, which uses a shadow stack to protect return addresses from being overwritten by an attacker. Details are available here: https://clang.llvm.org/docs/ShadowCallStack.html Note that security guarantees in the kernel differ from the ones documented for user space. The kernel must store addresses of shadow stacks in memory, which means an attacker capable reading and writing arbitrary memory may be able to locate them and hijack control flow by modifying the stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda [will: Numerous cosmetic changes] Signed-off-by: Will Deacon --- kernel/Makefile | 1 + kernel/fork.c | 9 ++++++++ kernel/sched/core.c | 2 ++ kernel/scs.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 kernel/scs.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..f6339f9d232d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..934e03cfaec7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -6040,6 +6041,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..38f8f31c9451 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include +#include +#include +#include + +static struct kmem_cache *scs_cache; + +static void *scs_alloc(int node) +{ + void *s; + + s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + if (s) { + *__scs_magic(s) = SCS_END_MAGIC; + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + } + + return s; +} + +static void scs_free(void *s) +{ + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = s; + task_scs_offset(tsk) = 0; + + return 0; +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_free(s); +} -- cgit v1.2.3 From 628d06a48f57c36abdc2a024930212e654a501b7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:08 -0700 Subject: scs: Add page accounting for shadow call stack allocations This change adds accounting for the memory allocated for shadow stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Acked-by: Will Deacon Signed-off-by: Will Deacon --- kernel/scs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 38f8f31c9451..6d2f983ac54e 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -6,8 +6,10 @@ */ #include +#include #include #include +#include #include static struct kmem_cache *scs_cache; @@ -40,6 +42,17 @@ void __init scs_init(void) scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); } +static struct page *__scs_page(struct task_struct *tsk) +{ + return virt_to_page(task_scs(tsk)); +} + +static void scs_account(struct task_struct *tsk, int account) +{ + mod_zone_page_state(page_zone(__scs_page(tsk)), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / 1024)); +} + int scs_prepare(struct task_struct *tsk, int node) { void *s = scs_alloc(node); @@ -49,6 +62,7 @@ int scs_prepare(struct task_struct *tsk, int node) task_scs(tsk) = s; task_scs_offset(tsk) = 0; + scs_account(tsk, 1); return 0; } @@ -61,5 +75,6 @@ void scs_release(struct task_struct *tsk) return; WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 5bbaf9d1fcb9be696ee9a61636ab6803556c70f2 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:09 -0700 Subject: scs: Add support for stack usage debugging Implements CONFIG_DEBUG_STACK_USAGE for shadow stacks. When enabled, also prints out the highest shadow stack usage per process. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Acked-by: Will Deacon [will: rewrote most of scs_check_usage()] Signed-off-by: Will Deacon --- kernel/scs.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 6d2f983ac54e..9389c28f0853 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -63,10 +63,37 @@ int scs_prepare(struct task_struct *tsk, int node) task_scs(tsk) = s; task_scs_offset(tsk) = 0; scs_account(tsk, 1); - return 0; } +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used++; + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + void scs_release(struct task_struct *tsk) { void *s = task_scs(tsk); @@ -75,6 +102,7 @@ void scs_release(struct task_struct *tsk) return; WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 2f4c33063ad713e3a5b63002cf8362846e78bd71 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 15 May 2020 18:02:22 +0200 Subject: docs: sysctl/kernel: document ngroups_max This is a read-only export of NGROUPS_MAX, so this patch also changes the declarations in kernel/sysctl.c to const. Signed-off-by: Stephen Kitt Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20200515160222.7994-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..2ba9f449d273 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -146,7 +146,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; /* @@ -883,7 +883,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3 From b5945214b76a1f22929481724ffd448000ede914 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 4 May 2020 10:50:17 -0700 Subject: kernel/cpu_pm: Fix uninitted local in cpu_pm cpu_pm_notify() is basically a wrapper of notifier_call_chain(). notifier_call_chain() doesn't initialize *nr_calls to 0 before it starts incrementing it--presumably it's up to the callers to do this. Unfortunately the callers of cpu_pm_notify() don't init *nr_calls. This potentially means you could get too many or two few calls to CPU_PM_ENTER_FAILED or CPU_CLUSTER_PM_ENTER_FAILED depending on the luck of the stack. Let's fix this. Fixes: ab10023e0088 ("cpu_pm: Add cpu power management notifiers") Cc: stable@vger.kernel.org Cc: Rafael J. Wysocki Reviewed-by: Stephen Boyd Reviewed-by: Greg Kroah-Hartman Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200504104917.v6.3.I2d44fc0053d019f239527a4e5829416714b7e299@changeid Signed-off-by: Bjorn Andersson --- kernel/cpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index cbca6879ab7d..44a259338e33 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); -- cgit v1.2.3 From 2ec0616e870f0f2aa8353e0de057f0c2dc8d52d5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 May 2020 00:39:18 +0200 Subject: bpf: Fix check_return_code to only allow [0,1] in trace_iter progs As per 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") we only allow a range of [0,1] for return codes. Therefore BPF_TRACE_ITER relies on the default tnum_range(0, 1) which is set in range var. On recent merge of net into net-next commit e92888c72fbd ("bpf: Enforce returning 0 for fentry/fexit progs") got pulled in and caused a merge conflict with the changes from 15d83c4d7cef. The resolution had a snall hiccup in that it removed the [0,1] range restriction again so that BPF_TRACE_ITER would have no enforcement. Fix it by adding it back. Fixes: da07f52d3caf ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b14ee0e26d..9c7d67d65d8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7120,10 +7120,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_FEXIT: range = tnum_const(0); break; - case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } -- cgit v1.2.3 From 870c153cf0e6df1b8b5226af41b19945e8e0d143 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 May 2020 18:02:23 +0200 Subject: blktrace: Report pid with note messages Currently informational messages within block trace do not have PID information of the process reporting the message included. With BFQ it is sometimes useful to have the information and there's no good reason to omit the information from the trace. So just fill in pid information when generating note message. Signed-off-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ca39dc3230cb..ea47f2084087 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -170,10 +170,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } -- cgit v1.2.3 From 5c8f77a278737a6af44a892f0700d9aadb2b0de0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:00 +0200 Subject: irqdomain: Make irq_domain_reset_irq_data() available to non-hierarchical users irq_domain_reset_irq_data() doesn't modify the parent data, so it can be made available even if irq domain hierarchy is not being built. We'll subsequently use it in irq_sim code. Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20200514083901.23445-2-brgl@bgdev.pl --- kernel/irq/irqdomain.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..e2aa128ea3ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1047,6 +1047,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1247,18 +1259,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, } EXPORT_SYMBOL(irq_domain_set_info); -/** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match -- cgit v1.2.3 From 337cbeb2c13eb4cab84f576fd402d7ae4ed31ae1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:01 +0200 Subject: genirq/irq_sim: Simplify the API The interrupt simulator API exposes a lot of custom data structures and functions and doesn't reuse the interfaces already exposed by the irq subsystem. This patch tries to address it. We hide all the simulator-related data structures from users and instead rely on the well-known irq domain. When creating the interrupt simulator the user receives a pointer to a newly created irq_domain and can use it to create mappings for simulated interrupts. It is also possible to pass a handle to fwnode when creating the simulator domain and retrieve it using irq_find_matching_fwnode(). The irq_sim_fire() function is dropped as well. Instead we implement the irq_get/set_irqchip_state interface. We modify the two modules that use the simulator at the same time as adding these changes in order to reduce the intermediate bloat that would result when trying to migrate the drivers in separate patches. Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Acked-by: Jonathan Cameron #for IIO Link: https://lore.kernel.org/r/20200514083901.23445-3-brgl@bgdev.pl --- kernel/irq/Kconfig | 1 + kernel/irq/irq_sim.c | 267 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 166 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..d63c324895ea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -72,6 +72,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski + * Copyright (C) 2020 Bartosz Golaszewski */ -#include -#include #include +#include +#include +#include +#include + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data *data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. + * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. - */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); -- cgit v1.2.3 From fdb1b5e08929f21d29aedd447233b32f497b8999 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 18 May 2020 06:19:25 -0600 Subject: Revert "docs: sysctl/kernel: document ngroups_max" This reverts commit 2f4c33063ad713e3a5b63002cf8362846e78bd71. The changes here were fine, but there's a non-documentation change to sysctl.c that makes messes elsewhere; those changes should have been done independently. Signed-off-by: Jonathan Corbet --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ba9f449d273..8a176d8727a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -146,7 +146,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static const int ngroups_max = NGROUPS_MAX; +static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; /* @@ -883,7 +883,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = (void *)&ngroups_max, + .data = &ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3 From 29da4f91c0c1fbda12b8a31be0d564930208c92e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 14 May 2020 16:47:39 +0530 Subject: powerpc/watchpoint: Don't allow concurrent perf and ptrace events With Book3s DAWR, ptrace and perf watchpoints on powerpc behaves differently. Ptrace watchpoint works in one-shot mode and generates signal before executing instruction. It's ptrace user's job to single-step the instruction and re-enable the watchpoint. OTOH, in case of perf watchpoint, kernel emulates/single-steps the instruction and then generates event. If perf and ptrace creates two events with same or overlapping address ranges, it's ambiguous to decide who should single-step the instruction. Because of this issue, don't allow perf and ptrace watchpoint at the same time if their address range overlaps. Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Reviewed-by: Michael Neuling Link: https://lore.kernel.org/r/20200514111741.97993-15-ravi.bangoria@linux.ibm.com --- kernel/events/hw_breakpoint.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3cc8416ec844..b48d7039a015 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -213,6 +213,15 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, list_del(&bp->hw.bp_list); } +__weak int arch_reserve_bp_slot(struct perf_event *bp) +{ + return 0; +} + +__weak void arch_release_bp_slot(struct perf_event *bp) +{ +} + /* * Function to perform processor-specific cleanup during unregistration */ @@ -270,6 +279,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) struct bp_busy_slots slots = {0}; enum bp_type_idx type; int weight; + int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) @@ -294,6 +304,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; + ret = arch_reserve_bp_slot(bp); + if (ret) + return ret; + toggle_bp_slot(bp, true, type, weight); return 0; @@ -317,6 +331,8 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int weight; + arch_release_bp_slot(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); -- cgit v1.2.3 From 202164fbfa2b2ffa3e66b504e0f126ba9a745006 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:39 -0700 Subject: kgdb: Disable WARN_CONSOLE_UNLOCKED for all kgdb In commit 81eaadcae81b ("kgdboc: disable the console lock when in kgdb") we avoided the WARN_CONSOLE_UNLOCKED() yell when we were in kgdboc. That still works fine, but it turns out that we get a similar yell when using other I/O drivers. One example is the "I/O driver" for the kgdb test suite (kgdbts). When I enabled that I again got the same yells. Even though "kgdbts" doesn't actually interact with the user over the console, using it still causes kgdb to print to the consoles. That trips the same warning: con_is_visible+0x60/0x68 con_scroll+0x110/0x1b8 lf+0x4c/0xc8 vt_console_print+0x1b8/0x348 vkdb_printf+0x320/0x89c kdb_printf+0x68/0x90 kdb_main_loop+0x190/0x860 kdb_stub+0x2cc/0x3ec kgdb_cpu_enter+0x268/0x744 kgdb_handle_exception+0x1a4/0x200 kgdb_compiled_brk_fn+0x34/0x44 brk_handler+0x7c/0xb8 do_debug_exception+0x1b4/0x228 Let's increment/decrement the "ignore_console_lock_warning" variable all the time when we enter the debugger. This will allow us to later revert commit 81eaadcae81b ("kgdboc: disable the console lock when in kgdb"). Signed-off-by: Douglas Anderson Reviewed-by: Greg Kroah-Hartman Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20200507130644.v4.1.Ied2b058357152ebcc8bf68edd6f20a11d98d7d4e@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2266ba27f27d..b542f36499c6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -666,6 +666,8 @@ return_normal: if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) goto kgdb_restore; + atomic_inc(&ignore_console_lock_warning); + /* Call the I/O driver's pre_exception routine */ if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); @@ -738,6 +740,8 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); + atomic_dec(&ignore_console_lock_warning); + if (!kgdb_single_step) { raw_spin_unlock(&dbg_slave_lock); /* Wait till all the CPUs have quit from the debugger. */ -- cgit v1.2.3 From 51189c7a7ed1b4ed4493e27275d466ff60406d3a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:11:05 +0100 Subject: arm64: scs: Store absolute SCS stack pointer value in thread_info Storing the SCS information in thread_info as a {base,offset} pair introduces an additional load instruction on the ret-to-user path, since the SCS stack pointer in x18 has to be converted back to an offset by subtracting the base. Replace the offset with the absolute SCS stack pointer value instead and avoid the redundant load. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 9389c28f0853..5ff8663e4a67 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -60,8 +60,7 @@ int scs_prepare(struct task_struct *tsk, int node) if (!s) return -ENOMEM; - task_scs(tsk) = s; - task_scs_offset(tsk) = 0; + task_scs(tsk) = task_scs_sp(tsk) = s; scs_account(tsk, 1); return 0; } -- cgit v1.2.3 From bee348fab099b0f551caa874663e82a7f3bb64b3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:43:11 +0100 Subject: scs: Move accounting into alloc/free functions There's no need to perform the shadow stack page accounting independently of the lifetime of the underlying allocation, so call the accounting code from the {alloc,free}() functions and simplify the code in the process. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 5ff8663e4a67..aea841cd7586 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -14,25 +14,35 @@ static struct kmem_cache *scs_cache; +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + static void *scs_alloc(int node) { - void *s; - - s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); - if (s) { - *__scs_magic(s) = SCS_END_MAGIC; - /* - * Poison the allocation to catch unintentional accesses to - * the shadow stack when KASAN is enabled. - */ - kasan_poison_object_data(scs_cache, s); - } + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); return s; } static void scs_free(void *s) { + __scs_account(s, -1); kasan_unpoison_object_data(scs_cache, s); kmem_cache_free(scs_cache, s); } @@ -42,17 +52,6 @@ void __init scs_init(void) scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); } -static struct page *__scs_page(struct task_struct *tsk) -{ - return virt_to_page(task_scs(tsk)); -} - -static void scs_account(struct task_struct *tsk, int account) -{ - mod_zone_page_state(page_zone(__scs_page(tsk)), NR_KERNEL_SCS_KB, - account * (SCS_SIZE / 1024)); -} - int scs_prepare(struct task_struct *tsk, int node) { void *s = scs_alloc(node); @@ -61,7 +60,6 @@ int scs_prepare(struct task_struct *tsk, int node) return -ENOMEM; task_scs(tsk) = task_scs_sp(tsk) = s; - scs_account(tsk, 1); return 0; } @@ -102,6 +100,5 @@ void scs_release(struct task_struct *tsk) WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); scs_check_usage(tsk); - scs_account(tsk, -1); scs_free(s); } -- cgit v1.2.3 From 88485be531f4aee841ddc53b56e2f6e6a338854d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:56:05 +0100 Subject: scs: Move scs_overflow_check() out of architecture code There is nothing architecture-specific about scs_overflow_check() as it's just a trivial wrapper around scs_corrupted(). For parity with task_stack_end_corrupted(), rename scs_corrupted() to task_scs_end_corrupted() and call it from schedule_debug() when CONFIG_SCHED_STACK_END_CHECK_is enabled, which better reflects its purpose as a debug feature to catch inadvertent overflow of the SCS. Finally, remove the unused scs_overflow_check() function entirely. This has absolutely no impact on architectures that do not support SCS (currently arm64 only). Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/sched/core.c | 3 +++ kernel/scs.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 934e03cfaec7..a1d815a11b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3878,6 +3878,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/kernel/scs.c b/kernel/scs.c index aea841cd7586..faf0ecd7b893 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -98,7 +98,8 @@ void scs_release(struct task_struct *tsk) if (!s) return; - WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); scs_check_usage(tsk); scs_free(s); } -- cgit v1.2.3 From aa7a65ae5b8f459617e5ed1422301386e7f12274 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 16:15:46 +0100 Subject: scs: Remove references to asm/scs.h from core code asm/scs.h is no longer needed by the core code, so remove a redundant header inclusion and update the stale Kconfig text. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/scs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index faf0ecd7b893..222a7a9ad543 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -10,7 +10,6 @@ #include #include #include -#include static struct kmem_cache *scs_cache; -- cgit v1.2.3 From b1a57bbfcc17c87e5cc76695ebb0565380c7501a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:42 -0700 Subject: kgdb: Delay "kgdbwait" to dbg_late_init() by default Using kgdb requires at least some level of architecture-level initialization. If nothing else, it relies on the architecture to pass breakpoints / crashes onto kgdb. On some architectures this all works super early, specifically it starts working at some point in time before Linux parses early_params's. On other architectures it doesn't. A survey of a few platforms: a) x86: Presumably it all works early since "ekgdboc" is documented to work here. b) arm64: Catching crashes works; with a simple patch breakpoints can also be made to work. c) arm: Nothing in kgdb works until paging_init() -> devicemaps_init() -> early_trap_init() Let's be conservative and, by default, process "kgdbwait" (which tells the kernel to drop into the debugger ASAP at boot) a bit later at dbg_late_init() time. If an architecture has tested it and wants to re-enable super early debugging, they can select the ARCH_HAS_EARLY_DEBUG KConfig option. We'll do this for x86 to start. It should be noted that dbg_late_init() is still called quite early in the system. Note that this patch doesn't affect when kgdb runs its init. If kgdb is set to initialize early it will still initialize when parsing early_param's. This patch _only_ inhibits the initial breakpoint from "kgdbwait". This means: * Without any extra patches arm64 platforms will at least catch crashes after kgdb inits. * arm platforms will catch crashes (and could handle a hardcoded kgdb_breakpoint()) any time after early_trap_init() runs, even before dbg_late_init(). Signed-off-by: Douglas Anderson Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200507130644.v4.4.I3113aea1b08d8ce36dc3720209392ae8b815201b@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b542f36499c6..1cd8e8b41115 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -948,6 +948,14 @@ void kgdb_panic(const char *msg) kgdb_breakpoint(); } +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + pr_crit("Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + void __weak kgdb_arch_late(void) { } @@ -958,6 +966,9 @@ void __init dbg_late_init(void) if (kgdb_io_module_registered) kgdb_arch_late(); kdb_init(KDB_INIT_FULL); + + if (kgdb_io_module_registered && kgdb_break_asap) + kgdb_initial_breakpoint(); } static int @@ -1053,14 +1064,6 @@ void kgdb_schedule_breakpoint(void) } EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - pr_crit("Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - /** * kgdb_register_io_module - register KGDB IO module * @new_dbg_io_ops: the io ops vector @@ -1097,7 +1100,8 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) /* Arm KGDB now. */ kgdb_register_callbacks(); - if (kgdb_break_asap) + if (kgdb_break_asap && + (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))) kgdb_initial_breakpoint(); return 0; @@ -1167,7 +1171,8 @@ static int __init opt_kgdb_wait(char *str) kgdb_break_asap = 1; kdb_init(KDB_INIT_EARLY); - if (kgdb_io_module_registered) + if (kgdb_io_module_registered && + IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)) kgdb_initial_breakpoint(); return 0; -- cgit v1.2.3 From 3ca676e4ca60d1834bb77535dafe24169cadacef Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:44 -0700 Subject: kgdb: Prevent infinite recursive entries to the debugger If we detect that we recursively entered the debugger we should hack our I/O ops to NULL so that the panic() in the next line won't actually cause another recursion into the debugger. The first line of kgdb_panic() will check this and return. Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20200507130644.v4.6.I89de39f68736c9de610e6f241e68d8dbc44bc266@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1cd8e8b41115..e2d67b163fb6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -530,6 +530,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) if (exception_level > 1) { dump_stack(); + kgdb_io_module_registered = false; panic("Recursive entry to debugger"); } -- cgit v1.2.3 From 220995622da5317714b5fe659165735f7b44b87e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:46 -0700 Subject: kgdboc: Add kgdboc_earlycon to support early kgdb using boot consoles We want to enable kgdb to debug the early parts of the kernel. Unfortunately kgdb normally is a client of the tty API in the kernel and serial drivers don't register to the tty layer until fairly late in the boot process. Serial drivers do, however, commonly register a boot console. Let's enable the kgdboc driver to work with boot consoles to provide early debugging. This change co-opts the existing read() function pointer that's part of "struct console". It's assumed that if a boot console (with the flag CON_BOOT) has implemented read() that both the read() and write() function are polling functions. That means they work without interrupts and read() will return immediately (with 0 bytes read) if there's nothing to read. This should be a safe assumption since it appears that no current boot consoles implement read() right now and there seems no reason to do so unless they wanted to support "kgdboc_earlycon". The normal/expected way to make all this work is to use "kgdboc_earlycon" and "kgdboc" together. You should point them both to the same physical serial connection. At boot time, as the system transitions from the boot console to the normal console (and registers a tty), kgdb will switch over. One awkward part of all this, though, is that there can be a window where the boot console goes away and we can't quite transtion over to the main kgdboc that uses the tty layer. There are two main problems: 1. The act of registering the tty doesn't cause any call into kgdboc so there is a window of time when the tty is there but kgdboc's init code hasn't been called so we can't transition to it. 2. On some serial drivers the normal console inits (and replaces the boot console) quite early in the system. Presumably these drivers were coded up before earlycon worked as well as it does today and probably they don't need to do this anymore, but it causes us problems nontheless. Problem #1 is not too big of a deal somewhat due to the luck of probe ordering. kgdboc is last in the tty/serial/Makefile so its probe gets right after all other tty devices. It's not fun to rely on this, but it does work for the most part. Problem #2 is a big deal, but only for some serial drivers. Other serial drivers end up registering the console (which gets rid of the boot console) and tty at nearly the same time. The way we'll deal with the window when the system has stopped using the boot console and the time when we're setup using the tty is to keep using the boot console. This may sound surprising, but it has been found to work well in practice. If it doesn't work, it shouldn't be too hard for a given serial driver to make it keep working. Specifically, it's expected that the read()/write() function provided in the boot console should be the same (or nearly the same) as the normal kgdb polling functions. That means continuing to use them should work just fine. To make things even more likely to work work we'll also trap the recently added exit() function in the boot console we're using and delay any calls to it until we're all done with the boot console. NOTE: there could be ways to use all this in weird / unexpected ways. If you do something like this, it's a bit of a buyer beware situation. Specifically: - If you specify only "kgdboc_earlycon" but not "kgdboc" then (depending on your serial driver) things will probably work OK, but you'll get a warning printed the first time you use kgdb after the boot console is gone. You'd only be able to do this, of course, if the serial driver you're running atop provided an early boot console. - If your "kgdboc_earlycon" and "kgdboc" devices are not the same device things should work OK, but it'll be your job to switch over which device you're monitoring (including figuring out how to switch over gdb in-flight if you're using it). When trying to enable "kgdboc_earlycon" it should be noted that the names that are registered through the boot console layer and the tty layer are not the same for the same port. For example when debugging on one board I'd need to pass "kgdboc_earlycon=qcom_geni kgdboc=ttyMSM0" to enable things properly. Since digging up the boot console name is a pain and there will rarely be more than one boot console enabled, you can provide the "kgdboc_earlycon" parameter without specifying the name of the boot console. In this case we'll just pick the first boot that implements read() that we find. This new "kgdboc_earlycon" parameter should be contrasted to the existing "ekgdboc" parameter. While both provide a way to debug very early, the usage and mechanisms are quite different. Specifically "kgdboc_earlycon" is meant to be used in tandem with "kgdboc" and there is a transition from one to the other. The "ekgdboc" parameter, on the other hand, replaces the "kgdboc" parameter. It runs the same logic as the "kgdboc" parameter but just relies on your TTY driver being present super early. The only known usage of the old "ekgdboc" parameter is documented as "ekgdboc=kbd earlyprintk=vga". It should be noted that "kbd" has special treatment allowing it to init early as a tty device. Signed-off-by: Douglas Anderson Reviewed-by: Greg Kroah-Hartman Tested-by: Sumit Garg Link: https://lore.kernel.org/r/20200507130644.v4.8.I8fba5961bf452ab92350654aa61957f23ecf0100@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index e2d67b163fb6..4d59aa907fdc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1073,15 +1073,23 @@ EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); */ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { + struct kgdb_io *old_dbg_io_ops; int err; spin_lock(&kgdb_registration_lock); - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); + old_dbg_io_ops = dbg_io_ops; + if (old_dbg_io_ops) { + if (!old_dbg_io_ops->deinit) { + spin_unlock(&kgdb_registration_lock); - pr_err("Another I/O driver is already registered with KGDB\n"); - return -EBUSY; + pr_err("KGDB I/O driver %s can't replace %s.\n", + new_dbg_io_ops->name, old_dbg_io_ops->name); + return -EBUSY; + } + pr_info("Replacing I/O driver %s with %s\n", + old_dbg_io_ops->name, new_dbg_io_ops->name); + old_dbg_io_ops->deinit(); } if (new_dbg_io_ops->init) { @@ -1096,6 +1104,9 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops) + return 0; + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ @@ -1132,6 +1143,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops->deinit) + old_dbg_io_ops->deinit(); + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } -- cgit v1.2.3 From f83b04d36e52cc3d941120ec859374fcda36eb31 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 16 Apr 2020 10:38:04 +0800 Subject: kgdb: Add kgdb_has_hit_break function The break instruction in RISC-V does not have an immediate value field, so the kernel cannot identify the purpose of each trap exception through the opcode. This makes the existing identification schemes in other architecture unsuitable for the RISC-V kernel. To solve this problem, this patch adds kgdb_has_hit_break(), which can help RISC-V kernel identify the KGDB trap exception. Signed-off-by: Vincent Chen Reviewed-by: Palmer Dabbelt Acked-by: Daniel Thompson Signed-off-by: Palmer Dabbelt --- kernel/debug/debug_core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..01bc3eea3d4d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -417,6 +417,18 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } +int kgdb_has_hit_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_ACTIVE && + kgdb_break[i].bpt_addr == addr) + return 1; + } + return 0; +} + int dbg_remove_all_break(void) { int error; -- cgit v1.2.3 From 2318976619daf0e868de5b8aff19c1fd8d585867 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 14 May 2020 11:36:41 +0100 Subject: ARM: 8976/1: module: allow arch overrides for .init section names ARM stores unwind information for .init.text in sections named .ARM.extab.init.text and .ARM.exidx.init.text. Since those aren't currently recognized as init sections, they're allocated along with the core section, and relocation fails if the core and the init section are allocated from different regions and can't reach other. final section addresses: ... 0x7f800000 .init.text .. 0xcbb54078 .ARM.exidx.init.text .. section 16 reloc 0 sym '': relocation 42 out of range (0xcbb54078 -> 0x7f800000) Allow architectures to override the section name so that ARM can fix this. Acked-by: Jessica Yu Signed-off-by: Vincent Whitchurch Signed-off-by: Russell King --- kernel/module.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..d29c23d07aff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2400,7 +2400,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2433,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2768,6 +2768,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); -- cgit v1.2.3 From 9d78edeaec759f997c303f286ecd39daee166f2a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 18 May 2020 20:07:38 +0200 Subject: proc: proc_pid_ns takes super_block as an argument syzbot found that touch /proc/testfile causes NULL pointer dereference at tomoyo_get_local_path() because inode of the dentry is NULL. Before c59f415a7cb6, Tomoyo received pid_ns from proc's s_fs_info directly. Since proc_pid_ns() can only work with inode, using it in the tomoyo_get_local_path() was wrong. To avoid creating more functions for getting proc_ns, change the argument type of the proc_pid_ns() function. Then, Tomoyo can use the existing super_block to get pid_ns. Link: https://lkml.kernel.org/r/0000000000002f0c7505a5b0e04c@google.com Link: https://lkml.kernel.org/r/20200518180738.2939611-1-gladkov.alexey@gmail.com Reported-by: syzbot+c1af344512918c61362c@syzkaller.appspotmail.com Fixes: c59f415a7cb6 ("Use proc_pid_ns() to get pid_namespace from the proc superblock") Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4385f3d639f2..e7bdaccad942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1745,7 +1745,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } -- cgit v1.2.3 From 0995a5dfbe49badff78e78761fb66f46579f2f9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 13:09:50 +0100 Subject: tracing: Provide lockdep less trace_hardirqs_on/off() variants trace_hardirqs_on/off() is only partially safe vs. RCU idle. The tracer core itself is safe, but the resulting tracepoints can be utilized by e.g. BPF which is unsafe. Provide variants which do not contain the lockdep invocation so the lockdep and tracer invocations can be split at the call site and placed properly. This is required because lockdep needs to be aware of the state before switching away from RCU idle and after switching to RCU idle because these transitions can take locks. As these code pathes are going to be non-instrumentable the tracer can be invoked after RCU is turned on and before the switch to RCU idle. So for these new variants there is no need to invoke the rcuidle aware tracer functions. Name them so they match the lockdep counterparts. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.270771162@linutronix.de --- kernel/trace/trace_preemptirq.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..c00880162b06 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -33,6 +51,25 @@ void trace_hardirqs_on(void) EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_off_prepare(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); + void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { -- cgit v1.2.3 From c86e9b987cea3dd0209203e714553a47f5d7c6dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Mar 2020 14:22:03 +0100 Subject: lockdep: Prepare for noinstr sections Force inlining and prevent instrumentation of all sorts by marking the functions which are invoked from low level entry code with 'noinstr'. Split the irqflags tracking into two parts. One which does the heavy lifting while RCU is watching and the final one which can be invoked after RCU is turned off. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.484532537@linutronix.de --- kernel/locking/lockdep.c | 86 +++++++++++++++++++++++++++++++---------- kernel/trace/trace_preemptirq.c | 2 + 2 files changed, 67 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..9ccd675a8b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3635,13 +3635,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,15 +3651,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. + */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3698,20 +3699,62 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || curr->lockdep_recursion)) + return; + + if (curr->hardirqs_enabled) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks || curr->lockdep_recursion)) return; /* @@ -3729,10 +3772,11 @@ void lockdep_hardirqs_off(unsigned long ip) curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -4408,8 +4452,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4740,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4956,7 +5000,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index c00880162b06..fb0691b8a88d 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,6 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -93,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -- cgit v1.2.3 From 8c4e93c362ff114def211d4629b120af86eb1275 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 24 Feb 2020 13:13:31 +0100 Subject: printk: Prepare for nested printk_nmi_enter() There is plenty of space in the printk_context variable. Reserve one byte there for the NMI context to be on the safe side. It should never overflow. The BUG_ON(in_nmi() == NMI_MASK) in nmi_enter() will trigger much earlier. Signed-off-by: Petr Mladek Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.681374113@linutronix.de --- kernel/printk/internal.h | 8 +++++--- kernel/printk/printk_safe.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..e8791f206417 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -295,12 +295,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void notrace printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } void notrace printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* -- cgit v1.2.3 From b0f51883f551b900a04a80f49fb0886caf7e9a12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:25:03 +0100 Subject: printk: Disallow instrumenting print_nmi_enter() It happens early in nmi_enter(), no tracing, probing or other funnies allowed. Specifically as nmi_enter() will be used in do_debug(), which would cause recursive exceptions when kprobed. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.139720912@linutronix.de --- kernel/printk/printk_safe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index e8791f206417..4242403316bb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -293,12 +294,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void notrace printk_nmi_enter(void) +void noinstr printk_nmi_enter(void) { this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -void notrace printk_nmi_exit(void) +void noinstr printk_nmi_exit(void) { this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -- cgit v1.2.3 From e616cb8daadf637175af4fe53138a94c190c4816 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:14:51 +0100 Subject: lockdep: Always inline lockdep_{off,on}() These functions are called {early,late} in nmi_{enter,exit} and should not be traced or probed. They are also puny, so 'inline' them. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.048523500@linutronix.de --- kernel/locking/lockdep.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..6f1c8cba09c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) -- cgit v1.2.3 From 178ba00c354eb15cec6806a812771e60a5ae3ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:26:21 +0100 Subject: sh/ftrace: Move arch_ftrace_nmi_{enter,exit} into nmi exception SuperH is the last remaining user of arch_ftrace_nmi_{enter,exit}(), remove it from the generic code and into the SuperH code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Steven Rostedt (VMware) Cc: Rich Felker Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200505134101.248881738@linutronix.de --- kernel/trace/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1e9a8f9a3459..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB -- cgit v1.2.3 From ff5c4f5cad33061b07c3fb9187506783c0f3cb66 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2020 17:32:17 +0100 Subject: rcu/tree: Mark the idle relevant functions noinstr These functions are invoked from context tracking and other places in the low level entry code. Move them into the .noinstr.text section to exclude them from instrumentation. Mark the places which are safe to invoke traceable functions with instrumentation_begin/end() so objtool won't complain. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: https://lkml.kernel.org/r/20200505134100.575356107@linutronix.de --- kernel/rcu/tree.c | 83 +++++++++++++++++++++++++++--------------------- kernel/rcu/tree_plugin.h | 4 +-- kernel/rcu/update.c | 3 +- 3 files changed, 49 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f288477ee1c2..0713ef3fa560 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -88,9 +88,6 @@ */ #define RCU_DYNTICK_CTRL_MASK 0x1 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, @@ -242,7 +239,7 @@ void rcu_softirq_qs(void) * RCU is watching prior to the call to this function and is no longer * watching upon return. */ -static void rcu_dynticks_eqs_enter(void) +static noinstr void rcu_dynticks_eqs_enter(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -267,7 +264,7 @@ static void rcu_dynticks_eqs_enter(void) * called from an extended quiescent state, that is, RCU is not watching * prior to the call to this function and is watching upon return. */ -static void rcu_dynticks_eqs_exit(void) +static noinstr void rcu_dynticks_eqs_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -285,8 +282,6 @@ static void rcu_dynticks_eqs_exit(void) if (seq & RCU_DYNTICK_CTRL_MASK) { atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); } } @@ -314,7 +309,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -static bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -603,7 +598,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter(bool user) +static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -618,12 +613,14 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); + instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... rcu_dynticks_eqs_enter(); @@ -660,7 +657,7 @@ void rcu_idle_enter(void) * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_enter(void) +noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); @@ -693,19 +690,23 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { + instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); + instrumentation_end(); return; } + instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) rcu_prepare_for_idle(); + instrumentation_end(); // RCU is watching here ... rcu_dynticks_eqs_enter(); @@ -721,7 +722,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +void noinstr rcu_nmi_exit(void) { rcu_nmi_exit_common(false); } @@ -745,7 +746,7 @@ void rcu_nmi_exit(void) * If you add or remove a call to rcu_irq_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_exit(void) +void noinstr rcu_irq_exit(void) { lockdep_assert_irqs_disabled(); rcu_nmi_exit_common(true); @@ -774,7 +775,7 @@ void rcu_irq_exit_irqson(void) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void rcu_eqs_exit(bool user) +static void noinstr rcu_eqs_exit(bool user) { struct rcu_data *rdp; long oldval; @@ -792,12 +793,14 @@ static void rcu_eqs_exit(bool user) // RCU is not watching here ... rcu_dynticks_eqs_exit(); // ... but is watching here. + instrumentation_begin(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } /** @@ -828,7 +831,7 @@ void rcu_idle_exit(void) * If you add or remove a call to rcu_user_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_exit(void) +void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } @@ -876,28 +879,35 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rcu_cleanup_after_idle(); incby = 1; - } else if (irq && tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the extended - // quiescent state and this was an interrupt (not an NMI). - // Therefore, (1) RCU is already watching and (2) The fact - // that we are in an interrupt handler and that the rcu_node - // lock is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } else if (irq) { + instrumentation_begin(); + if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { + // We get here only if we had already exited the + // extended quiescent state and this was an + // interrupt (not an NMI). Therefore, (1) RCU is + // already watching and (2) The fact that we are in + // an interrupt handler and that the rcu_node lock + // is an irq-disabled lock prevents self-deadlock. + // So we can safely recheck under the lock. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU + // needs a quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } - raw_spin_unlock_rcu_node(rdp->mynode); + instrumentation_end(); } + instrumentation_begin(); trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); @@ -906,11 +916,10 @@ static __always_inline void rcu_nmi_enter_common(bool irq) /** * rcu_nmi_enter - inform RCU of entry to NMI context */ -void rcu_nmi_enter(void) +noinstr void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } -NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -934,7 +943,7 @@ NOKPROBE_SYMBOL(rcu_nmi_enter); * If you add or remove a call to rcu_irq_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_enter(void) +noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); rcu_nmi_enter_common(true); @@ -979,7 +988,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. */ -bool notrace rcu_is_watching(void) +bool rcu_is_watching(void) { bool ret; @@ -1031,12 +1040,12 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi() || !rcu_scheduler_fully_active) return true; - preempt_disable(); + preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ret = true; - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 50caa3fcbad2..352223664ebd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2539,7 +2539,7 @@ static void rcu_bind_gp_kthread(void) } /* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) +static void noinstr rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); @@ -2547,7 +2547,7 @@ static void rcu_dynticks_task_enter(void) } /* Record no current task on dyntick-idle exit. */ -static void rcu_dynticks_task_exit(void) +static void noinstr rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ce63a91d956..84843adfd939 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -284,13 +284,12 @@ struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? -- cgit v1.2.3 From 9ea366f669ded353ae49754216c042e7d2f72ba6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Feb 2020 12:31:16 -0800 Subject: rcu: Make RCU IRQ enter/exit functions rely on in_nmi() The rcu_nmi_enter_common() and rcu_nmi_exit_common() functions take an "irq" parameter that indicates whether these functions have been invoked from an irq handler (irq==true) or an NMI handler (irq==false). However, recent changes have applied notrace to a few critical functions such that rcu_nmi_enter_common() and rcu_nmi_exit_common() many now rely on in_nmi(). Note that in_nmi() works no differently than before, but rather that tracing is now prohibited in code regions where in_nmi() would incorrectly report NMI state. Therefore remove the "irq" parameter and inline rcu_nmi_enter_common() and rcu_nmi_exit_common() into rcu_nmi_enter() and rcu_nmi_exit(), respectively. Signed-off-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.617130349@linutronix.de --- kernel/rcu/tree.c | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0713ef3fa560..945401674b9d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -664,16 +664,18 @@ noinstr void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ -/* +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test + * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -704,7 +706,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (irq) + if (!in_nmi()) rcu_prepare_for_idle(); instrumentation_end(); @@ -712,21 +714,10 @@ static __always_inline void rcu_nmi_exit_common(bool irq) rcu_dynticks_eqs_enter(); // ... but is no longer watching here. - if (irq) + if (!in_nmi()) rcu_dynticks_task_enter(); } -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -749,7 +740,7 @@ void noinstr rcu_nmi_exit(void) void noinstr rcu_irq_exit(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); + rcu_nmi_exit(); } /* @@ -838,7 +829,7 @@ void noinstr rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter_common - inform RCU of entry to NMI context + * rcu_nmi_enter - inform RCU of entry to NMI context * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and @@ -847,10 +838,10 @@ void noinstr rcu_user_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test + * If you add or remove a call to rcu_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -868,18 +859,18 @@ static __always_inline void rcu_nmi_enter_common(bool irq) */ if (rcu_dynticks_curr_cpu_in_eqs()) { - if (irq) + if (!in_nmi()) rcu_dynticks_task_exit(); // RCU is not watching here ... rcu_dynticks_eqs_exit(); // ... but is watching here. - if (irq) + if (!in_nmi()) rcu_cleanup_after_idle(); incby = 1; - } else if (irq) { + } else if (!in_nmi()) { instrumentation_begin(); if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && @@ -913,14 +904,6 @@ static __always_inline void rcu_nmi_enter_common(bool irq) barrier(); } -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -noinstr void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); -} - /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -946,7 +929,7 @@ noinstr void rcu_nmi_enter(void) noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + rcu_nmi_enter(); } /* -- cgit v1.2.3 From 8ae0ae6737ad449c8ae21e2bb01d9736f360a933 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 May 2020 15:08:52 +0200 Subject: rcu: Provide rcu_irq_exit_preempt() Interrupts and exceptions invoke rcu_irq_enter() on entry and need to invoke rcu_irq_exit() before they either return to the interrupted code or invoke the scheduler due to preemption. The general assumption is that RCU idle code has to have preemption disabled so that a return from interrupt cannot schedule. So the return from interrupt code invokes rcu_irq_exit() and preempt_schedule_irq(). If there is any imbalance in the rcu_irq/nmi* invocations or RCU idle code had preemption enabled then this goes unnoticed until the CPU goes idle or some other RCU check is executed. Provide rcu_irq_exit_preempt() which can be invoked from the interrupt/exception return code in case that preemption is enabled. It invokes rcu_irq_exit() and contains a few sanity checks in case that CONFIG_PROVE_RCU is enabled to catch such issues directly. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.364456424@linutronix.de --- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 945401674b9d..62ee01299386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -743,6 +743,28 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From b1fcf9b83c4149c63d1e0c699e85f93cbe28e211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2020 09:44:43 +0200 Subject: rcu: Provide __rcu_is_watching() Same as rcu_is_watching() but without the preempt_disable/enable() pair inside the function. It is merked noinstr so it ends up in the non-instrumentable text section. This is useful for non-preemptible code especially in the low level entry section. Using rcu_is_watching() there results in a call to the preempt_schedule_notrace() thunk which triggers noinstr section warnings in objtool. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200512213810.518709291@linutronix.de --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62ee01299386..90c8be22d57a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -985,6 +985,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3 From 66e9b0717102507e64f638790eaece88765cc9e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2020 14:04:34 +0100 Subject: kprobes: Prevent probes in .noinstr.text section Instrumentation is forbidden in the .noinstr.text section. Make kprobes respect this. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200505134100.179862032@linutronix.de --- kernel/kprobes.c | 18 ++++++++++++++++++ kernel/module.c | 3 +++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9eb5acf0a9f3..3f310df4a693 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2229,6 +2229,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? : arch_populate_kprobe_blacklist(); } @@ -2248,6 +2254,12 @@ static void add_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_add_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } } static void remove_module_kprobe_blacklist(struct module *mod) @@ -2265,6 +2277,12 @@ static void remove_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_remove_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } } /* Module notifier call back, checking kprobes on the module */ diff --git a/kernel/module.c b/kernel/module.c index faf733789560..72ed2b3a6ee2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3150,6 +3150,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), -- cgit v1.2.3 From c73be61cede5882f9605a852414db559c0ebedfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Add general notification queue support Make it possible to have a general notification queue built on top of a standard pipe. Notifications are 'spliced' into the pipe and then read out. splice(), vmsplice() and sendfile() are forbidden on pipes used for notifications as post_one_notification() cannot take pipe->mutex. This means that notifications could be posted in between individual pipe buffers, making iov_iter_revert() difficult to effect. The way the notification queue is used is: (1) An application opens a pipe with a special flag and indicates the number of messages it wishes to be able to queue at once (this can only be set once): pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, queue_depth); (2) The application then uses poll() and read() as normal to extract data from the pipe. read() will return multiple notifications if the buffer is big enough, but it will not split a notification across buffers - rather it will return a short read or EMSGSIZE. Notification messages include a length in the header so that the caller can split them up. Each message has a header that describes it: struct watch_notification { __u32 type:24; __u32 subtype:8; __u32 info; }; The type indicates the source (eg. mount tree changes, superblock events, keyring changes, block layer events) and the subtype indicates the event type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field indicates a number of things, including the entry length, an ID assigned to a watchpoint contributing to this buffer and type-specific flags. Supplementary data, such as the key ID that generated an event, can be attached in additional slots. The maximum message size is 127 bytes. Messages may not be padded or aligned, so there is no guarantee, for example, that the notification type will be on a 4-byte bounary. Signed-off-by: David Howells --- kernel/Makefile | 1 + kernel/watch_queue.c | 657 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 658 insertions(+) create mode 100644 kernel/watch_queue.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..41e7e3ae07ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 000000000000..c103e34f8705 --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. + */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +static int watch_queue_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return -1; /* No. */ +} + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .confirm = generic_pipe_buf_confirm, + .release = watch_queue_pipe_buf_release, + .steal = watch_queue_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. + */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = 0; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in units of sizeof(*n). + */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocation for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue. + */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch_queue - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue. + */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. + */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} -- cgit v1.2.3 From 8cfba76383e902acbed95092163052b1572f17a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Allow buffers to be marked read-whole-or-error for notifications Allow a buffer to be marked such that read() must return the entire buffer in one go or return ENOBUFS. Multiple buffers can be amalgamated into a single read, but a short read will occur if the next "whole" buffer won't fit. This is useful for watch queue notifications to make sure we don't split a notification across multiple reads, especially given that we need to fabricate an overrun record under some circumstances - and that isn't in the buffers. Signed-off-by: David Howells --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c103e34f8705..ad64ea300f6d 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -115,7 +115,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; - buf->flags = 0; + buf->flags = PIPE_BUF_FLAG_WHOLE; pipe->head = head + 1; if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { -- cgit v1.2.3 From e7d553d69cf63aec7de0f38fed49ccbb30922e1e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:12 +0000 Subject: pipe: Add notification lossage handling Add handling for loss of notifications by having read() insert a loss-notification message after it has read the pipe buffer that was last in the ring when the loss occurred. Lossage can come about either by running out of notification descriptors or by running out of space in the pipe ring. Signed-off-by: David Howells --- kernel/watch_queue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index ad64ea300f6d..9a9699c06709 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -132,6 +132,8 @@ out: return done; lost: + buf = &pipe->bufs[(head - 1) & mask]; + buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } -- cgit v1.2.3 From ab7e9b067f3d9cbec28cfca51d341efb421b7a51 Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Thu, 7 May 2020 09:19:52 +0200 Subject: PM: hibernate: Incorporate concurrency handling Hibernation concurrency handling is currently delegated to user.c, where it's also used for regulating the access to the snapshot device. In the prospective of making user.c a separate configuration option, such mutual exclusion is brought into hibernate.c and made available through accessor helpers hereby introduced. Signed-off-by: Domenico Andreoli Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 20 ++++++++++++++++---- kernel/power/power.h | 4 ++-- kernel/power/user.c | 10 ++++------ 3 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30bd28d1d418..02ec716a4927 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -67,6 +67,18 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); @@ -704,7 +716,7 @@ int hibernate(void) lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } @@ -775,7 +787,7 @@ int hibernate(void) Exit: __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: unlock_system_sleep(); pr_info("hibernation exit\n"); @@ -880,7 +892,7 @@ static int software_resume(void) goto Unlock; /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(FMODE_READ); goto Unlock; @@ -911,7 +923,7 @@ static int software_resume(void) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); pr_info("resume failed (%d)\n", error); - atomic_inc(&snapshot_device_available); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7cdc64dc2373..ba2094db6294 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. */ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7959449765d9..98548d1cf8a6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -37,8 +37,6 @@ static struct snapshot_data { bool free_bitmaps; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); - static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; @@ -49,13 +47,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) lock_system_sleep(); - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } @@ -92,7 +90,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) - atomic_inc(&snapshot_device_available); + hibernate_release(); data->frozen = false; data->ready = false; @@ -122,7 +120,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); unlock_system_sleep(); -- cgit v1.2.3 From c4f39a6c74389fcc93ac39056ef342f32ab57a23 Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Thu, 7 May 2020 09:19:53 +0200 Subject: PM: hibernate: Split off snapshot dev option Make it possible to reduce the attack surface in case the snapshot device is not to be used from userspace. Signed-off-by: Domenico Andreoli Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 12 ++++++++++++ kernel/power/Makefile | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c208566c844b..4d0e6e815a2b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,6 +80,18 @@ config HIBERNATION For more information take a look at . +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + ---help--- + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index e7e47d9be1e5..5899260a8bef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows to avoid packet-based NAT given in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but in a few occasions we've seen breakage when validating the peer's address since it returns unexpectedly the backend tuple instead of the service one. Therefore, this trivial patch allows to customise and adds a getpeername() as well as getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: -- cgit v1.2.3 From b34cb07dde7c2346dec73d053ce926aeaa087303 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 12 May 2020 09:52:22 -0400 Subject: sched/fair: Fix enqueue_task_fair() warning some more sched/fair: Fix enqueue_task_fair warning some more The recent patch, fe61468b2cb (sched/fair: Fix enqueue_task_fair warning) did not fully resolve the issues with the rq->tmp_alone_branch != &rq->leaf_cfs_rq_list warning in enqueue_task_fair. There is a case where the first for_each_sched_entity loop exits due to on_rq, having incompletely updated the list. In this case the second for_each_sched_entity loop can further modify se. The later code to fix up the list management fails to do what is needed because se does not point to the sched_entity which broke out of the first loop. The list is not fixed up because the throttled parent was already added back to the list by a task enqueue in a parallel child hierarchy. Address this by calling list_add_leaf_cfs_rq if there are throttled parents while doing the second for_each_sched_entity loop. Fixes: fe61468b2cb ("sched/fair: Fix enqueue_task_fair warning") Suggested-by: Vincent Guittot Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200512135222.GC2201@lorien.usersys.redhat.com --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..c6d57c334d51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5479,6 +5479,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } enqueue_throttle: -- cgit v1.2.3 From ad32bb41fca67936c0c1d6d0bdd6d3e2e9c5432f Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Sun, 10 May 2020 18:26:41 +0530 Subject: sched/debug: Fix requested task uclamp values shown in procfs The intention of commit 96e74ebf8d59 ("sched/debug: Add task uclamp values to SCHED_DEBUG procfs") was to print requested and effective task uclamp values. The requested values printed are read from p->uclamp, which holds the last effective values. Fix this by printing the values from p->uclamp_req. Fixes: 96e74ebf8d59 ("sched/debug: Add task uclamp values to SCHED_DEBUG procfs") Signed-off-by: Pavankumar Kondeti Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/1589115401-26391-1-git-send-email-pkondeti@codeaurora.org --- kernel/sched/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a562df57a86e..239970b991c0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -948,8 +948,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_est.enqueued); #endif #ifdef CONFIG_UCLAMP_TASK - __PS("uclamp.min", p->uclamp[UCLAMP_MIN].value); - __PS("uclamp.max", p->uclamp[UCLAMP_MAX].value); + __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); + __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); #endif -- cgit v1.2.3 From 39f23ce07b9355d05a64ae303ce20d1c4b92b957 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 13 May 2020 15:55:28 +0200 Subject: sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list Although not exactly identical, unthrottle_cfs_rq() and enqueue_task_fair() are quite close and follow the same sequence for enqueuing an entity in the cfs hierarchy. Modify unthrottle_cfs_rq() to use the same pattern as enqueue_task_fair(). This fixes a problem already faced with the latter and add an optimization in the last for_each_sched_entity loop. Fixes: fe61468b2cb (sched/fair: Fix enqueue_task_fair warning) Reported-by Tao Zhou Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20200513135528.4742-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c6d57c334d51..538ba5d94e99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4774,7 +4774,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4798,26 +4797,44 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (enqueue) { - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); - } else { - update_load_avg(cfs_rq, se, 0); - se_update_runnable(se); - } + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto unthrottle_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } - if (!se) - add_nr_running(rq, task_delta); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); +unthrottle_throttle: /* * The cfs_rq_throttled() breaks in the above iteration can result in * incomplete leaf list maintenance, resulting in triggering the @@ -4826,7 +4843,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - list_add_leaf_cfs_rq(cfs_rq); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } assert_list_leaf_cfs_rq(rq); -- cgit v1.2.3 From 7d148be69e3a0eaa9d029a3c51b545e322116a99 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 13 May 2020 15:55:02 +0200 Subject: sched/fair: Optimize enqueue_task_fair() enqueue_task_fair jumps to enqueue_throttle label when cfs_rq_of(se) is throttled which means that se can't be NULL in such case and we can move the label after the if (!se) statement. Futhermore, the latter can be removed because se is always NULL when reaching this point. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200513135502.4672-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a58874ef104..4e586863827b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5512,28 +5512,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() -- cgit v1.2.3 From 12aa2587388de6697fd2e585ae6a90f70249540b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 7 May 2020 11:10:39 +0800 Subject: sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr() The cpuacct_charge() and cpuacct_account_field() are called with rq->lock held, and this means preemption(and IRQs) are indeed disabled, so it is safe to use __this_cpu_*() to allow for better code-generation. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507031039.32615-1-songmuchun@bytedance.com --- kernel/sched/cpuacct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..6448b0438ffb 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -347,7 +347,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + __this_cpu_add(ca->cpuusage->usages[index], cputime); rcu_read_unlock(); } @@ -363,7 +363,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; + __this_cpu_add(ca->cpustat->cpustat[index], val); rcu_read_unlock(); } -- cgit v1.2.3 From 95d685935a2edf209fc68f52494ede4a382a6c2b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 6 May 2020 17:53:01 +0200 Subject: sched/pelt: Sync util/runnable_sum with PELT window when propagating update_tg_cfs_*() propagate the impact of the attach/detach of an entity down into the cfs_rq hierarchy and must keep the sync with the current pelt window. Even if we can't sync child cfs_rq and its group se, we can sync the group se and its parent cfs_rq with current position in the PELT window. In fact, we must keep them sync in order to stay also synced with others entities and group entities that are already attached to the cfs_rq. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200506155301.14288-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 49 +++++++++++++++++++++++++++---------------------- kernel/sched/pelt.c | 24 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e586863827b..44b0c8edc260 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3441,52 +3441,46 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + se->avg.runnable_sum = se->avg.runnable_avg * divider; /* Update parent cfs_rq runnable */ add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; } static inline void @@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq unsigned long load_avg; u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b647d04d9c8b..b4b1ff96642f 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. + */ static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { -- cgit v1.2.3 From 04f5c362ec6d3ff0e14f1c05230b550da7f528a4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:21:41 -0500 Subject: sched/fair: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507192141.GA16183@embeddedor --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44b0c8edc260..01f94cf52783 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1094,7 +1094,7 @@ struct numa_group { * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 21416b30c520..2bd2a222318a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1462,7 +1462,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) -- cgit v1.2.3 From dbe9337109c2705f08e6a00392f991eb2d2570a5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 20 Apr 2020 15:04:53 +0800 Subject: sched/cpuacct: Fix charge cpuacct.usage_sys The user_mode(task_pt_regs(tsk)) always return true for user thread, and false for kernel thread. So it means that the cpuacct.usage_sys is the time that kernel thread uses not the time that thread uses in the kernel mode. We can try get_irq_regs() first, if it is NULL, then we can fall back to task_pt_regs(). Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200420070453.76815-1-songmuchun@bytedance.com --- kernel/sched/cpuacct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 6448b0438ffb..941c28cf9738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -5,6 +5,7 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include #include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ @@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); + struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); if (regs && user_mode(regs)) index = CPUACCT_STAT_USER; -- cgit v1.2.3 From d505b8af58912ae1e1a211fabc9995b19bd40828 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Sat, 25 Apr 2020 18:52:48 +0800 Subject: sched: Defend cfs and rt bandwidth quota against overflow When users write some huge number into cpu.cfs_quota_us or cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of schedulable checks. to_ratio() could be altered to avoid unnecessary internal overflow, but min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to prevent overflow. Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20200425105248.60093-1-changhuaixin@linux.alibaba.com --- kernel/sched/core.c | 8 ++++++++ kernel/sched/rt.c | 12 +++++++++++- kernel/sched/sched.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74fb89b5ce3e..fa905b6daafa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7379,6 +7379,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7406,6 +7408,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..6d60ba21ed29 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,8 @@ int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2bd2a222318a..f7ab6334e992 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1915,6 +1915,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.2.3 From c50c75e9b87946499a62bffc021e95c87a1d57cd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 11 May 2020 15:12:27 -0500 Subject: perf/core: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200511201227.GA14041@embeddedor --- kernel/events/callchain.c | 2 +- kernel/events/internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..b1991043b7d8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); -- cgit v1.2.3 From db78538c75e49c09b002a2cd96a19ae0c39be771 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:58:04 -0500 Subject: locking/lockdep: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200507185804.GA15036@embeddedor --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..cfdff122905b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -489,7 +489,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) -- cgit v1.2.3 From c143b7753b308d5f7d03b165a7fdff1dda1f271d Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 15 May 2020 10:08:28 +0000 Subject: ftrace: show debugging information when panic_on_warn set When an anomaly is detected in the function call modification code, ftrace_bug() is called to disable function tracing as well as give some warn and information that may help debug the problem. But currently, we call FTRACE_WARN_ON_ONCE() first in ftrace_bug(), so when panic_on_warn is set, we can't see the debugging information here. Call FTRACE_WARN_ON_ONCE() at the end of ftrace_bug() to ensure that the debugging information is displayed first. after this patch, the dmesg looks like: ------------[ ftrace bug ]------------ ftrace failed to modify [] bcm2835_handle_irq+0x4/0x58 actual: 1f:20:03:d5 Setting ftrace call site to call ftrace function ftrace record flags: 80000001 (1) expected tramp: ffff80001009d6f0 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1635 at kernel/trace/ftrace.c:2078 ftrace_bug+0x204/0x238 Kernel panic - not syncing: panic_on_warn set ... CPU: 2 PID: 1635 Comm: sh Not tainted 5.7.0-rc5-00033-gb922183867f5 #14 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1b0 show_stack+0x20/0x30 dump_stack+0xc0/0x10c panic+0x16c/0x368 __warn+0x120/0x160 report_bug+0xc8/0x160 bug_handler+0x28/0x98 brk_handler+0x70/0xd0 do_debug_exception+0xcc/0x1ac el1_sync_handler+0xe4/0x120 el1_sync+0x7c/0x100 ftrace_bug+0x204/0x238 Link: https://lkml.kernel.org/r/20200515100828.7091-1-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..cd39cbf3631a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2027,14 +2027,14 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) { unsigned long ip = rec ? rec->ip : 0; + pr_info("------------[ ftrace bug ]------------\n"); + switch (failed) { case -EFAULT: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; case -EINVAL: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); @@ -2045,12 +2045,10 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } break; case -EPERM: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); print_ip_sym(ip); break; default: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } @@ -2077,6 +2075,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) ip = ftrace_get_addr_curr(rec); pr_cont("\n expected tramp: %lx\n", ip); } + + FTRACE_WARN_ON_ONCE(1); } static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) -- cgit v1.2.3 From fc9d276f22330e9322a9c592c71e0571810205f7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 7 May 2020 21:30:08 +0200 Subject: tracing/probe: reverse arguments to list_add Elsewhere in the file, the function trace_kprobe_has_same_kprobe uses a trace_probe_event.probes object as the second argument of list_for_each_entry, ie as a list head, while the list_for_each_entry iterates over the list fields of the trace_probe structures, making them the list elements. So, exchange the arguments on the list_add call to put the list head in the second argument. Since both list_head structures were just initialized, this problem did not cause any loss of information. Link: https://lkml.kernel.org/r/1588879808-24488-1-git-send-email-Julia.Lawall@inria.fr Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Acked-by: Masami Hiramatsu Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ab8b6436d53f..b8a928e925c7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event, INIT_LIST_HEAD(&tp->event->class.fields); INIT_LIST_HEAD(&tp->event->probes); INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); + list_add(&tp->list, &tp->event->probes); call = trace_probe_event_call(tp); call->class = &tp->event->class; -- cgit v1.2.3 From 6797d97ab9d1b0ef94bf6063920669409dc2d730 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:13 +0200 Subject: trace: remove tracing_pipe_buf_ops tracing_pipe_buf_ops has identical ops to default_pipe_buf_ops, so use that instead. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..bc9783797d27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6304,13 +6304,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { @@ -6372,7 +6365,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .ops = &tracing_pipe_buf_ops, + .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; -- cgit v1.2.3 From 76887c256744740d6121af9bc4aa787712a1f694 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:14 +0200 Subject: fs: make the pipe_buf_operations ->steal operation optional Just return 1 for failure if it is not present. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc9783797d27..29fa25cfb6c2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7576,7 +7576,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, static const struct pipe_buf_operations buffer_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; -- cgit v1.2.3 From b8d9e7f2411b0744df2ec33e80d7698180fef21a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:15 +0200 Subject: fs: make the pipe_buf_operations ->confirm operation optional Just return 0 for success if it is not present. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/relay.c | 1 - kernel/trace/trace.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..c5ece4c2b04d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29fa25cfb6c2..63d1ab978435 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7574,7 +7574,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; -- cgit v1.2.3 From c928f642c29a5ffb02e16f2430b42b876dde69de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:16 +0200 Subject: fs: rename pipe_buf ->steal to ->try_steal And replace the arcane return value convention with a simple bool where true means success and false means failure. [AV: braino fix folded in] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/relay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c5ece4c2b04d..5c2263f3f857 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,9 +1177,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = relay_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -- cgit v1.2.3 From 87b047d2be417b271d80f5e490a825c6fd53ecad Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 16 Mar 2020 12:21:12 -0500 Subject: exec: Teach prepare_exec_creds how exec treats uids & gids It is almost possible to use the result of prepare_exec_creds with no modifications during exec. Update prepare_exec_creds to initialize the suid and the fsuid to the euid, and the sgid and the fsgid to the egid. This is all that is needed to handle the common case of exec when nothing special like a setuid exec is happening. That this preserves the existing behavior of exec can be verified by examing bprm_fill_uid and cap_bprm_set_creds. This change makes it clear that the later parts of exec that update bprm->cred are just need to handle special cases such as setuid exec and change of domains. Link: https://lkml.kernel.org/r/871rng22dm.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/cred.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 71a792616917..421b1149c651 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -315,6 +315,9 @@ struct cred *prepare_exec_creds(void) new->process_keyring = NULL; #endif + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; + return new; } -- cgit v1.2.3 From 9d44a121c5a79bc8a9d67c058456bd52a83c79e7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 20 May 2020 14:47:13 -0400 Subject: audit: add subj creds to NETFILTER_CFG record to Some table unregister actions seem to be initiated by the kernel to garbage collect unused tables that are not initiated by any userspace actions. It was found to be necessary to add the subject credentials to cover this case to reveal the source of these actions. A sample record: The uid, auid, tty, ses and exe fields have not been included since they are in the SYSCALL record and contain nothing useful in the non-user context. Here are two sample orphaned records: type=NETFILTER_CFG msg=audit(2020-05-20 12:14:36.505:5) : table=filter family=ipv4 entries=0 op=register pid=1 subj=kernel comm=swapper/0 type=NETFILTER_CFG msg=audit(2020-05-20 12:15:27.701:301) : table=nat family=bridge entries=0 op=unregister pid=30 subj=system_u:system_r:kernel_t:s0 comm=kworker/u4:1 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cfe3486e5f31..468a23390457 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2557,12 +2557,18 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op) { struct audit_buffer *ab; + char comm[sizeof(current->comm)]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); + + audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_end(ab); } EXPORT_SYMBOL_GPL(__audit_log_nfcfg); -- cgit v1.2.3 From dfeb376dd4cb2c5004aeb625e2475f58a5ff2ea7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 May 2020 22:38:24 -0700 Subject: bpf: Prevent mmap()'ing read-only maps as writable As discussed in [0], it's dangerous to allow mapping BPF map, that's meant to be frozen and is read-only on BPF program side, because that allows user-space to actually store a writable view to the page even after it is frozen. This is exacerbated by BPF verifier making a strong assumption that contents of such frozen map will remain unchanged. To prevent this, disallow mapping BPF_F_RDONLY_PROG mmap()'able BPF maps as writable, ever. [0] https://lore.kernel.org/bpf/CAEf4BzYGWYhXdp6BJ7_=9OQPJxQpgug080MMjdSB72i9R+5c6g@mail.gmail.com/ Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Suggested-by: Jann Horn Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Jann Horn Link: https://lore.kernel.org/bpf/20200519053824.1089415-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2843bbba9ca1..4e6dee19a668 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -623,9 +623,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) mutex_lock(&map->freeze_mutex); - if ((vma->vm_flags & VM_WRITE) && map->frozen) { - err = -EPERM; - goto out; + if (vma->vm_flags & VM_WRITE) { + if (map->frozen) { + err = -EPERM; + goto out; + } + /* map is meant to be read-only, so do not allow mapping as + * writable, because it's possible to leak a writable page + * reference and allows user-space to still modify it after + * freezing, while verifier will assume contents do not change + */ + if (map->map_flags & BPF_F_RDONLY_PROG) { + err = -EACCES; + goto out; + } } /* set default open/close callbacks */ -- cgit v1.2.3 From 181e9d4efaf6aa8d1e7d510aeb7114c0f276fad7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:25 +0300 Subject: irqdomain: Make __irq_domain_add() less OF-dependent __irq_domain_add() relies in some places on the fact that the fwnode can be only of type OF. This prevents refactoring of the code to support other types of fwnode. Make it less OF-dependent by switching it to use the fwnode directly where it makes sense. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-1-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e2aa128ea3ee..7649f3848ab5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { - struct device_node *of_node = to_of_node(fwnode); struct irqchip_fwid *fwid; struct irq_domain *domain; static atomic_t unknown_domains; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), - GFP_KERNEL, of_node_to_nid(of_node)); + GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); if (!domain) return NULL; @@ -177,15 +176,15 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->fwnode = fwnode; #endif - } else if (of_node) { + } else if (is_of_node(fwnode)) { char *name; /* - * DT paths contain '/', which debugfs is legitimately + * fwnode paths contain '/', which debugfs is legitimately * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kasprintf(GFP_KERNEL, "%pOF", of_node); + name = kasprintf(GFP_KERNEL, "%pfw", fwnode); if (!name) { kfree(domain); return NULL; @@ -210,7 +209,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } - of_node_get(of_node); + fwnode_handle_get(fwnode); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -259,7 +258,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(irq_domain_get_of_node(domain)); + fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); -- cgit v1.2.3 From 87526603c89256e18ad2c23821fdaf376b072fc8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:26 +0300 Subject: irqdomain: Get rid of special treatment for ACPI in __irq_domain_add() Now that __irq_domain_add() is able to better deals with generic fwnodes, there is no need to special-case ACPI anymore. Get rid of the special treatment for ACPI. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-2-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7649f3848ab5..5d14d910ab64 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -161,22 +161,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } -#ifdef CONFIG_ACPI - } else if (is_acpi_device_node(fwnode)) { - struct acpi_buffer buf = { - .length = ACPI_ALLOCATE_BUFFER, - }; - acpi_handle handle; - - handle = acpi_device_handle(to_acpi_device_node(fwnode)); - if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { - domain->name = buf.pointer; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - - domain->fwnode = fwnode; -#endif - } else if (is_of_node(fwnode)) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode)) { char *name; /* -- cgit v1.2.3 From 9ed78b05f998050784ae863bd5ba4aea2e2141ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 May 2020 19:49:27 +0300 Subject: irqdomain: Allow software nodes for IRQ domain creation In some cases we need to have an IRQ domain created out of software node. One of such cases is DesignWare GPIO driver when it's instantiated from half-baked ACPI table (alas, we can't fix it for devices which are few years on market) and thus using software nodes to quirk this. But the driver is using IRQ domains based on per GPIO port firmware nodes, which are in the above case software ones. This brings a warning message to be printed [ 73.957183] irq: Invalid fwnode type for irqdomain and creates an anonymous IRQ domain without a debugfs entry. Allowing software nodes to be valid for IRQ domains rids us of the warning and debugs gets correctly populated. % ls -1 /sys/kernel/debug/irq/domains/ ... intel-quark-dw-apb-gpio:portA Signed-off-by: Andy Shevchenko [maz: refactored commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200520164927.39090-3-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5d14d910ab64..a4c2c915511d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -161,7 +161,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } - } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode)) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || + is_software_node(fwnode)) { char *name; /* -- cgit v1.2.3 From 325606af573152e02f44d791f152b7f9564bcb30 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 19:35:36 +0800 Subject: printk: Fix a typo in comment "interator"->"iterator" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Cc: Steven Rostedt Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..525038745a14 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3360,7 +3360,7 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and @@ -3378,7 +3378,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) } /** - * kmsg_dump_rewind - reset the interator + * kmsg_dump_rewind - reset the iterator * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and -- cgit v1.2.3 From 8ece3b3eb576a78d2e67ad4c3a80a39fa6708809 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Tue, 17 Mar 2020 07:33:44 -0300 Subject: kernel/printk: add kmsg SEEK_CUR handling Userspace libraries, e.g. glibc's dprintf(), perform a SEEK_CUR operation over any file descriptor requested to make sure the current position isn't pointing to junk due to previous manipulation of that same fd. And whenever that fd doesn't have support for such operation, the userspace code expects -ESPIPE to be returned. However, when the fd in question references the /dev/kmsg interface, the current kernel code state returns -EINVAL instead, causing an unexpected behavior in userspace: in the case of glibc, when -ESPIPE is returned it gets ignored and the call completes successfully, while returning -EINVAL forces dprintf to fail without performing any action over that fd: if (_IO_SEEKOFF (fp, (off64_t)0, _IO_seek_cur, _IOS_INPUT|_IOS_OUTPUT) == _IO_pos_BAD && errno != ESPIPE) return NULL; With this patch we make sure to return the correct value when SEEK_CUR is requested over kmsg and also add some kernel doc information to formalize this behavior. Link: https://lore.kernel.org/r/20200317103344.574277-1-bmeneg@redhat.com Cc: linux-kernel@vger.kernel.org Cc: rostedt@goodmis.org, Cc: David.Laight@ACULAB.COM Signed-off-by: Bruno Meneguele Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 525038745a14..35cc5f548860 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -974,6 +974,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) user->idx = log_next_idx; user->seq = log_next_seq; break; + case SEEK_CUR: + /* + * It isn't supported due to the record nature of this + * interface: _SET _DATA and _END point to very specific + * record positions, while _CUR would be more useful in case + * of a byte-based log. Because of that, return the default + * errno value for invalid seek operation. + */ + ret = -ESPIPE; + break; default: ret = -EINVAL; } -- cgit v1.2.3 From d20a1676df7e4c3c23d73299159811a50e4854bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 20 May 2020 21:20:50 +0200 Subject: xsk: Move xskmap.c to net/xdp/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP is partly implemented by net/xdp/xsk.c. Move xskmap.c from kernel/bpf/ to net/xdp/, which is the logical place for AF_XDP related code. Also, move AF_XDP struct definitions, and function declarations only used by AF_XDP internals into net/xdp/xsk.h. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-3-bjorn.topel@gmail.com --- kernel/bpf/Makefile | 3 - kernel/bpf/xskmap.c | 265 ---------------------------------------------------- 2 files changed, 268 deletions(-) delete mode 100644 kernel/bpf/xskmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. - */ - -#include -#include -#include -#include -#include - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? *(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - .map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; -- cgit v1.2.3 From cac616db39c207dc63465a4e05c6ce0e60b2cce4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 May 2020 13:07:26 -0700 Subject: bpf: Verifier track null pointer branch_taken with JNE and JEQ Currently, when considering the branches that may be taken for a jump instruction if the register being compared is a pointer the verifier assumes both branches may be taken. But, if the jump instruction is comparing if a pointer is NULL we have this information in the verifier encoded in the reg->type so we can do better in these cases. Specifically, these two common cases can be handled. * If the instruction is BPF_JEQ and we are comparing against a zero value. This test is 'if ptr == 0 goto +X' then using the type information in reg->type we can decide if the ptr is not null. This allows us to avoid pushing both branches onto the stack and instead only use the != 0 case. For example PTR_TO_SOCK and PTR_TO_SOCK_OR_NULL encode the null pointer. Note if the type is PTR_TO_SOCK_OR_NULL we can not learn anything. And also if the value is non-zero we learn nothing because it could be any arbitrary value a different pointer for example * If the instruction is BPF_JNE and ware comparing against a zero value then a similar analysis as above can be done. The test in asm looks like 'if ptr != 0 goto +X'. Again using the type information if the non null type is set (from above PTR_TO_SOCK) we know the jump is taken. In this patch we extend is_branch_taken() to consider this extra information and to return only the branch that will be taken. This resolves a verifier issue reported with C code like the following. See progs/test_sk_lookup_kern.c in selftests. sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; In the above the bpf_printk() will resolve the pointer from PTR_TO_SOCK_OR_NULL to PTR_TO_SOCK. Then the second test guarding the release will cause the verifier to walk both paths resulting in the an unreleased sock reference. See verifier/ref_tracking.c in selftests for an assembly version of the above. After the above additional logic is added the C code above passes as expected. Reported-by: Andrey Ignatov Suggested-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159009164651.6313.380418298578070501.stgit@john-Precision-5820-Tower --- kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ed8351f47a4..d2e27dba4ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_BTF_ID; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || @@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. + */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) -- cgit v1.2.3 From 48021f98130880dd74286459a1ef48b5e9bc374f Mon Sep 17 00:00:00 2001 From: Shreyas Joshi Date: Fri, 22 May 2020 16:53:06 +1000 Subject: printk: handle blank console arguments passed in. If uboot passes a blank string to console_setup then it results in a trashed memory. Ultimately, the kernel crashes during freeing up the memory. This fix checks if there is a blank parameter being passed to console_setup from uboot. In case it detects that the console parameter is blank then it doesn't setup the serial device and it gracefully exits. Link: https://lore.kernel.org/r/20200522065306.83-1-shreyas.joshi@biamp.com Signed-off-by: Shreyas Joshi Acked-by: Sergey Senozhatsky [pmladek@suse.com: Better format the commit message and code, remove unnecessary brackets.] Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 35cc5f548860..a3990505abdf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2200,6 +2200,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; + if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1; -- cgit v1.2.3 From c6e7bd7afaeb3af55ffac122828035f1c01d1d7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 24 May 2020 21:29:55 +0100 Subject: sched/core: Optimize ttwu() spinning on p->on_cpu Both Rik and Mel reported seeing ttwu() spend significant time on: smp_cond_load_acquire(&p->on_cpu, !VAL); Attempt to avoid this by queueing the wakeup on the CPU that owns the p->on_cpu value. This will then allow the ttwu() to complete without further waiting. Since we run schedule() with interrupts disabled, the IPI is guaranteed to happen after p->on_cpu is cleared, this is what makes it safe to queue early. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Cc: Jirka Hladky Cc: Vincent Guittot Cc: valentin.schneider@arm.com Cc: Hillf Danton Cc: Rik van Riel Link: https://lore.kernel.org/r/20200524202956.27665-2-mgorman@techsingularity.net --- kernel/sched/core.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa905b6daafa..903c9ee1db45 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,7 +2312,7 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -2354,6 +2354,17 @@ bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_remote(p, cpu, wake_flags); + return true; + } + + return false; +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2362,11 +2373,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_remote(p, cpu, wake_flags)) return; - } #endif rq_lock(rq, &rf); @@ -2548,7 +2556,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->on_rq && ttwu_remote(p, wake_flags)) goto unlock; + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + #ifdef CONFIG_SMP + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2570,6 +2586,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + */ + if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags)) + goto unlock; + /* * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. @@ -2581,28 +2607,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -- cgit v1.2.3 From 2ebb17717550607bcd85fb8cf7d24ac870e9d762 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sun, 24 May 2020 21:29:56 +0100 Subject: sched/core: Offload wakee task activation if it the wakee is descheduling The previous commit: c6e7bd7afaeb: ("sched/core: Optimize ttwu() spinning on p->on_cpu") avoids spinning on p->on_rq when the task is descheduling, but only if the wakee is on a CPU that does not share cache with the waker. This patch offloads the activation of the wakee to the CPU that is about to go idle if the task is the only one on the runqueue. This potentially allows the waker task to continue making progress when the wakeup is not strictly synchronous. This is very obvious with netperf UDP_STREAM running on localhost. The waker is sending packets as quickly as possible without waiting for any reply. It frequently wakes the server for the processing of packets and when netserver is using local memory, it quickly completes the processing and goes back to idle. The waker often observes that netserver is on_rq and spins excessively leading to a drop in throughput. This is a comparison of 5.7-rc6 against "sched: Optimize ttwu() spinning on p->on_cpu" and against this patch labeled vanilla, optttwu-v1r1 and localwakelist-v1r2 respectively. 5.7.0-rc6 5.7.0-rc6 5.7.0-rc6 vanilla optttwu-v1r1 localwakelist-v1r2 Hmean send-64 251.49 ( 0.00%) 258.05 * 2.61%* 305.59 * 21.51%* Hmean send-128 497.86 ( 0.00%) 519.89 * 4.43%* 600.25 * 20.57%* Hmean send-256 944.90 ( 0.00%) 997.45 * 5.56%* 1140.19 * 20.67%* Hmean send-1024 3779.03 ( 0.00%) 3859.18 * 2.12%* 4518.19 * 19.56%* Hmean send-2048 7030.81 ( 0.00%) 7315.99 * 4.06%* 8683.01 * 23.50%* Hmean send-3312 10847.44 ( 0.00%) 11149.43 * 2.78%* 12896.71 * 18.89%* Hmean send-4096 13436.19 ( 0.00%) 13614.09 ( 1.32%) 15041.09 * 11.94%* Hmean send-8192 22624.49 ( 0.00%) 23265.32 * 2.83%* 24534.96 * 8.44%* Hmean send-16384 34441.87 ( 0.00%) 36457.15 * 5.85%* 35986.21 * 4.48%* Note that this benefit is not universal to all wakeups, it only applies to the case where the waker often spins on p->on_rq. The impact can be seen from a "perf sched latency" report generated from a single iteration of one packet size: ----------------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------------- vanilla netperf:4337 | 21709.193 ms | 2932 | avg: 0.002 ms | max: 0.041 ms | max at: 112.154512 s netserver:4338 | 14629.459 ms | 5146990 | avg: 0.001 ms | max: 1615.864 ms | max at: 140.134496 s localwakelist-v1r2 netperf:4339 | 29789.717 ms | 2460 | avg: 0.002 ms | max: 0.059 ms | max at: 138.205389 s netserver:4340 | 18858.767 ms | 7279005 | avg: 0.001 ms | max: 0.362 ms | max at: 135.709683 s ----------------------------------------------------------------------------------------------------------------- Note that the average wakeup delay is quite small on both the vanilla kernel and with the two patches applied. However, there are significant outliers with the vanilla kernel with the maximum one measured as 1615 milliseconds with a vanilla kernel but never worse than 0.362 ms with both patches applied and a much higher rate of context switching. Similarly a separate profile of cycles showed that 2.83% of all cycles were spent in try_to_wake_up() with almost half of the cycles spent on spinning on p->on_rq. With the two patches, the percentage of cycles spent in try_to_wake_up() drops to 1.13% Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Jirka Hladky Cc: Vincent Guittot Cc: valentin.schneider@arm.com Cc: Hillf Danton Cc: Rik van Riel Link: https://lore.kernel.org/r/20200524202956.27665-3-mgorman@techsingularity.net --- kernel/sched/core.c | 39 +++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 3 ++- 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 903c9ee1db45..6130ab170e93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,7 +2312,13 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -2355,11 +2361,32 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. + */ + if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - __ttwu_queue_remote(p, cpu, wake_flags); + __ttwu_queue_wakelist(p, cpu, wake_flags); return true; } @@ -2373,7 +2400,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (ttwu_queue_remote(p, cpu, wake_flags)) + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; #endif @@ -2593,7 +2620,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags)) + if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) goto unlock; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7ab6334e992..4b32cff0dcbe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1685,7 +1685,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_RQ 0x08 /* Wakee is on_rq */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From 18f855e574d9799a0e7489f8ae6fd8447d0dd74a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 May 2020 09:38:31 -0600 Subject: sched/fair: Don't NUMA balance for kthreads Stefano reported a crash with using SQPOLL with io_uring: BUG: kernel NULL pointer dereference, address: 00000000000003b0 CPU: 2 PID: 1307 Comm: io_uring-sq Not tainted 5.7.0-rc7 #11 RIP: 0010:task_numa_work+0x4f/0x2c0 Call Trace: task_work_run+0x68/0xa0 io_sq_thread+0x252/0x3d0 kthread+0xf9/0x130 ret_from_fork+0x35/0x40 which is task_numa_work() oopsing on current->mm being NULL. The task work is queued by task_tick_numa(), which checks if current->mm is NULL at the time of the call. But this state isn't necessarily persistent, if the kthread is using use_mm() to temporarily adopt the mm of a task. Change the task_tick_numa() check to exclude kernel threads in general, as it doesn't make sense to attempt ot balance for kthreads anyway. Reported-by: Stefano Garzarella Signed-off-by: Jens Axboe Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/865de121-8190-5d30-ece5-3b097dc74431@kernel.dk --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 538ba5d94e99..da3e5b54715b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2908,7 +2908,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* -- cgit v1.2.3 From aaf2bc50df1f4bfc6857fc601fc7b21d5a18c6a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 May 2020 22:05:15 +0200 Subject: rcu: Abstract out rcu_irq_enter_check_tick() from rcu_nmi_enter() There will likely be exception handlers that can sleep, which rules out the usual approach of invoking rcu_nmi_enter() on entry and also rcu_nmi_exit() on all exit paths. However, the alternative approach of just not calling anything can prevent RCU from coaxing quiescent states from nohz_full CPUs that are looping in the kernel: RCU must instead IPI them explicitly. It would be better to enable the scheduler tick on such CPUs to interact with RCU in a lighter-weight manner, and this enabling is one of the things that rcu_nmi_enter() currently does. What is needed is something that helps RCU coax quiescent states while not preventing subsequent sleeps. This commit therefore splits out the nohz_full scheduler-tick enabling from the rest of the rcu_nmi_enter() logic into a new function named rcu_irq_enter_check_tick(). [ tglx: Renamed the function and made it a nop when context tracking is off ] [ mingo: Fixed a CONFIG_NO_HZ_FULL assumption, harmonized and fixed all the comment blocks and cleaned up rcu_nmi_enter()/exit() definitions. ] Suggested-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200521202116.996113173@linutronix.de --- kernel/rcu/tree.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90c8be22d57a..b7f8c494d1d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -848,6 +848,67 @@ void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** @@ -894,26 +955,7 @@ noinstr void rcu_nmi_enter(void) incby = 1; } else if (!in_nmi()) { instrumentation_begin(); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the - // extended quiescent state and this was an - // interrupt (not an NMI). Therefore, (1) RCU is - // already watching and (2) The fact that we are in - // an interrupt handler and that the rcu_node lock - // is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); - } + rcu_irq_enter_check_tick(); instrumentation_end(); } instrumentation_begin(); -- cgit v1.2.3 From 07325d4a90d2d84de45cc07b134fd0f023dbb971 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:16 +0200 Subject: rcu: Provide rcu_irq_exit_check_preempt() Provide a debug check which can be invoked from exception return to kernel mode before an attempt is made to schedule. Warn if RCU is not ready for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Paul E. McKenney Link: https://lore.kernel.org/r/20200521202117.089709607@linutronix.de --- kernel/rcu/tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7f8c494d1d1..d8e9dbbefcfa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -765,6 +765,24 @@ void rcu_irq_exit_preempt(void) "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From 6b6ebb34744b21467aa01be7c53cc570fc41f70d Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Wed, 13 May 2020 10:13:11 +0800 Subject: cgroup: Remove stale comments - The default root is where we can create v2 cgroups. - The __DEVEL__sane_behavior mount option has been removed long long ago. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..7a016749de21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ +/* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); @@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid) * cases where a subsystem should behave differnetly depending on the * interface version. * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" -- cgit v1.2.3 From 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 May 2020 14:06:17 -0700 Subject: /dev/mem: Revoke mappings when a driver claims the region Close the hole of holding a mapping over kernel driver takeover event of a given address range. Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the kernel against scenarios where a /dev/mem user tramples memory that a kernel driver owns. However, this protection only prevents *new* read(), write() and mmap() requests. Established mappings prior to the driver calling request_mem_region() are left alone. Especially with persistent memory, and the core kernel metadata that is stored there, there are plentiful scenarios for a /dev/mem user to violate the expectations of the driver and cause amplified damage. Teach request_mem_region() to find and shoot down active /dev/mem mappings that it believes it has successfully claimed for the exclusive use of the driver. Effectively a driver call to request_mem_region() becomes a hole-punch on the /dev/mem device. The typical usage of unmap_mapping_range() is part of truncate_pagecache() to punch a hole in a file, but in this case the implementation is only doing the "first half" of a hole punch. Namely it is just evacuating current established mappings of the "hole", and it relies on the fact that /dev/mem establishes mappings in terms of absolute physical address offsets. Once existing mmap users are invalidated they can attempt to re-establish the mapping, or attempt to continue issuing read(2) / write(2) to the invalidated extent, but they will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can block those subsequent accesses. Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") Signed-off-by: Dan Williams Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.2.3 From 342ed2400b78072cc01c0130ce41240dec60d56d Mon Sep 17 00:00:00 2001 From: Zhang Qiang Date: Wed, 27 May 2020 15:57:15 +0800 Subject: workqueue: Remove unnecessary kfree() call in rcu_free_wq() The data structure member "wq->rescuer" was reset to a null pointer in one if branch. It was passed to a call of the function "kfree" in the callback function "rcu_free_wq" (which was eventually executed). The function "kfree" does not perform more meaningful data processing for a passed null pointer (besides immediately returning from such a call). Thus delete this function call which became unnecessary with the referenced software update. Fixes: def98c84b6cd ("workqueue: Fix spurious sanity check failures in destroy_workqueue()") Suggested-by: Markus Elfring Signed-off-by: Zhang Qiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 10ed8d761e0b..7a1fc9fe6314 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3491,7 +3491,6 @@ static void rcu_free_wq(struct rcu_head *rcu) else free_workqueue_attrs(wq->unbound_attrs); - kfree(wq->rescuer); kfree(wq); } -- cgit v1.2.3 From ad1e4f74c072eaa2c6d77dd710db31aafecd614f Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Tue, 19 May 2020 20:14:10 +0200 Subject: PM: hibernate: Restrict writes to the resume device Hibernation via snapshot device requires write permission to the swap block device, the one that more often (but not necessarily) is used to store the hibernation image. With this patch, such permissions are granted iff: 1) snapshot device config option is enabled 2) swap partition is used as resume device In other circumstances the swap device is not writable from userspace. In order to achieve this, every write attempt to a swap device is checked against the device configured as part of the uswsusp API [0] using a pointer to the inode struct in memory. If the swap device being written was not configured for resuming, the write request is denied. NOTE: this implementation works only for swap block devices, where the inode configured by swapon (which sets S_SWAPFILE) is the same used by SNAPSHOT_SET_SWAP_AREA. In case of swap file, SNAPSHOT_SET_SWAP_AREA indeed receives the inode of the block device containing the filesystem where the swap file is located (+ offset in it) which is never passed to swapon and then has not set S_SWAPFILE. As result, the swap file itself (as a file) has never an option to be written from userspace. Instead it remains writable if accessed directly from the containing block device, which is always writeable from root. [0] Documentation/power/userland-swsusp.rst v2: - rename is_hibernate_snapshot_dev() to is_hibernate_resume_dev() - fix description so to correctly refer to the resume device Signed-off-by: Domenico Andreoli Acked-by: Darrick J. Wong Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 98548d1cf8a6..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,8 +35,14 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} + static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; @@ -95,6 +101,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -110,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -202,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -232,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; + bdput(bdev); return 0; } -- cgit v1.2.3 From 806f04e9fd2c6ad1e39bc2dba77155be0e4becde Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 19:12:36 +0200 Subject: rcu: Allow for smp_call_function() running callbacks from idle Current RCU hard relies on smp_call_function() callbacks running from interrupt context. A pending optimization is going to break that, it will allow idle CPUs to run the callbacks from the idle loop. This avoids raising the IPI on the requesting CPU and avoids handling an exception on the receiving CPU. Change rcu_is_cpu_rrupt_from_idle() to also accept task context, provided it is the idle task. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20200527171236.GC706495@hirez.programming.kicks-ass.net --- kernel/rcu/tree.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90c8be22d57a..f51385b86ea3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -418,16 +418,23 @@ void rcu_momentary_dyntick_idle(void) EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** - * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * interrupt, or directly, from idle, return true. + * + * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { - /* Called only from within the scheduling-clock interrupt */ - lockdep_assert_in_irq(); + long nesting; + + /* + * Usually called from the tick; but also used from smp_function_call() + * for expedited grace periods. This latter can result in running from + * the idle task, instead of an actual IPI. + */ + lockdep_assert_irqs_disabled(); /* Check for counter underflows */ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, @@ -436,9 +443,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + if (nesting > 1) return false; + /* + * If we're not in an interrupt, we must be in the idle task! + */ + WARN_ON_ONCE(!nesting && !is_idle_task(current)); + /* Does CPU appear to be idle from an RCU standpoint? */ return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -- cgit v1.2.3 From 19a1f5ec699954d21be10f74ff71c2a7079e99ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:10:58 +0200 Subject: sched: Fix smp_call_function_single_async() usage for ILB The recent commit: 90b5363acd47 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in kick_ilb() got this wrong. While on first reading kick_ilb() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. Rework the nohz_idle_balance() trigger so that the release is in the IPI callback and thus guarantees the required serialization for the CSD. Fixes: 90b5363acd47 ("sched: Clean up scheduler_ipi()") Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Cc: mgorman@techsingularity.net Link: https://lore.kernel.org/r/20200526161907.778543557@infradead.org --- kernel/sched/core.c | 36 ++++++++++-------------------------- kernel/sched/fair.c | 18 ++++++++---------- kernel/sched/sched.h | 1 + 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95e457d4ed1c..2cacc1e44a84 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -637,41 +637,25 @@ void wake_up_nohz_cpu(int cpu) wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. */ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -static void nohz_csd_func(void *info) -{ - struct rq *rq = info; - - if (got_nohz_idle_kick()) { - rq->idle_balance = 1; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; raise_softirq_irqoff(SCHED_SOFTIRQ); } } -#else /* CONFIG_NO_HZ_COMMON */ - -static inline bool got_nohz_idle_kick(void) -{ - return false; -} - #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dda9b194d225..2890bd54a088 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10024,6 +10024,10 @@ static void kick_ilb(unsigned int flags) if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; @@ -10371,20 +10375,14 @@ abort: */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; _nohz_idle_balance(this_rq, flags, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b32cff0dcbe..3c163cb5493f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -951,6 +951,7 @@ struct rq { struct callback_head *balance_callback; + unsigned char nohz_idle_balance; unsigned char idle_balance; unsigned long misfit_task_load; -- cgit v1.2.3 From 52103be07d8b08311955f8c30e535c2dda290cf4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:10:59 +0200 Subject: smp: Optimize flush_smp_call_function_queue() The call_single_queue can contain (two) different callbacks, synchronous and asynchronous. The current interrupt handler runs them in-order, which means that remote CPUs that are waiting for their synchronous call can be delayed by running asynchronous callbacks. Rework the interrupt handler to first run the synchonous callbacks. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.836818381@infradead.org --- kernel/smp.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..db2f73808db5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -209,9 +209,9 @@ void generic_smp_call_function_single_interrupt(void) */ static void flush_smp_call_function_queue(bool warn_cpu_offline) { - struct llist_head *head; - struct llist_node *entry; call_single_data_t *csd, *csd_next; + struct llist_node *entry, *prev; + struct llist_head *head; static bool warned; lockdep_assert_irqs_disabled(); @@ -235,20 +235,39 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func); } + /* + * First; run all SYNC callbacks, people are waiting for us. + */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { smp_call_func_t func = csd->func; void *info = csd->info; /* Do we wait until *after* callback? */ if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } func(info); csd_unlock(csd); } else { - csd_unlock(csd); - func(info); + prev = &csd->llist; } } + /* + * Second; run all !SYNC callbacks. + */ + llist_for_each_entry_safe(csd, csd_next, entry, llist) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } + /* * Handle irq works queued remotely by irq_work_queue_on(). * Smp functions above are typically synchronous so they -- cgit v1.2.3 From afaa653c564da38c0b34c4baba31e88c46a8764c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:00 +0200 Subject: smp: Move irq_work_run() out of flush_smp_call_function_queue() This ensures flush_smp_call_function_queue() is strictly about call_single_queue. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.895109676@infradead.org --- kernel/smp.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index db2f73808db5..f720e38e880d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -84,6 +84,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * still pending. */ flush_smp_call_function_queue(false); + irq_work_run(); return 0; } @@ -191,6 +192,14 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, void generic_smp_call_function_single_interrupt(void) { flush_smp_call_function_queue(true); + + /* + * Handle irq works queued remotely by irq_work_queue_on(). + * Smp functions above are typically synchronous so they + * better run first since some other CPUs may be busy waiting + * for them. + */ + irq_work_run(); } /** @@ -267,14 +276,6 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd_unlock(csd); func(info); } - - /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. - */ - irq_work_run(); } /* -- cgit v1.2.3 From b2a02fc43a1f40ef4eb2fb2b06357382608d4d84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:01 +0200 Subject: smp: Optimize send_call_function_single_ipi() Just like the ttwu_queue_remote() IPI, make use of _TIF_POLLING_NRFLAG to avoid sending IPIs to idle CPUs. [ mingo: Fix UP build bug. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161907.953304789@infradead.org --- kernel/sched/core.c | 10 ++++++++++ kernel/sched/idle.c | 5 +++++ kernel/sched/sched.h | 7 ++++--- kernel/smp.c | 16 +++++++++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2cacc1e44a84..fa0d4990618e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2296,6 +2296,16 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } +void send_call_function_single_ipi(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + /* * Queue a task on the target CPUs wake_list and wake the CPU via IPI if * necessary. The wakee CPU on receipt of the IPI will queue the task diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b743bf38f08f..387fd75ee6f7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -289,6 +289,11 @@ static void do_idle(void) */ smp_mb__after_atomic(); + /* + * RCU relies on this call to be done outside of an RCU read-side + * critical section. + */ + flush_smp_call_function_from_idle(); sched_ttwu_pending(); schedule_idle(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c163cb5493f..75b062999c43 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1506,11 +1506,12 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -#else +extern void flush_smp_call_function_from_idle(void); +#else /* !CONFIG_SMP: */ +static inline void flush_smp_call_function_from_idle(void) { } static inline void sched_ttwu_pending(void) { } - -#endif /* CONFIG_SMP */ +#endif #include "stats.h" #include "autogroup.h" diff --git a/kernel/smp.c b/kernel/smp.c index f720e38e880d..9f1181375141 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -135,6 +135,8 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); +extern void send_call_function_single_ipi(int cpu); + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have @@ -178,7 +180,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, * equipped to do the right thing... */ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - arch_send_call_function_single_ipi(cpu); + send_call_function_single_ipi(cpu); return 0; } @@ -278,6 +280,18 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } +void flush_smp_call_function_from_idle(void) +{ + unsigned long flags; + + if (llist_empty(this_cpu_ptr(&call_single_queue))) + return; + + local_irq_save(flags); + flush_smp_call_function_queue(true); + local_irq_restore(flags); +} + /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. -- cgit v1.2.3 From 4b44a21dd640b692d4e9b12d3e37c24825f90baa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:02 +0200 Subject: irq_work, smp: Allow irq_work on call_single_queue Currently irq_work_queue_on() will issue an unconditional arch_send_call_function_single_ipi() and has the handler do irq_work_run(). This is unfortunate in that it makes the IPI handler look at a second cacheline and it misses the opportunity to avoid the IPI. Instead note that struct irq_work and struct __call_single_data are very similar in layout, so use a few bits in the flags word to encode a type and stick the irq_work on the call_single_queue list. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.011635912@infradead.org --- kernel/irq_work.c | 53 +++++++++++++----------- kernel/smp.c | 119 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 102 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 48b5d1b6af4d..eca83965b631 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &work->llnode); } else { __irq_work_queue_local(work); } @@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) return true; } +void irq_work_single(void *arg) +{ + struct irq_work *work = arg; + int flags; + + /* + * Clear the PENDING bit, after this point the @work + * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. + */ + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + + lockdep_irq_work_enter(work); + work->func(work); + lockdep_irq_work_exit(work); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); +} + static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; @@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) { - int flags; - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. - */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); - - lockdep_irq_work_enter(work); - work->func(work); - lockdep_irq_work_exit(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. - */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); - } + llist_for_each_entry_safe(work, tmp, llnode, llnode) + irq_work_single(work); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 9f1181375141..856562b80794 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,10 +23,8 @@ #include "smpboot.h" -enum { - CSD_FLAG_LOCK = 0x01, - CSD_FLAG_SYNCHRONOUS = 0x02, -}; + +#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -137,15 +135,33 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); extern void send_call_function_single_ipi(int cpu); +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (llist_add(node, &per_cpu(call_single_queue, cpu))) + send_call_function_single_ipi(cpu); +} + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd, - smp_call_func_t func, void *info) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { + smp_call_func_t func = csd->func; + void *info = csd->info; unsigned long flags; /* @@ -159,28 +175,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, return 0; } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } - csd->func = func; - csd->info = info; - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. - * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &csd->llist); return 0; } @@ -194,16 +194,10 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, void generic_smp_call_function_single_interrupt(void) { flush_smp_call_function_queue(true); - - /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. - */ - irq_work_run(); } +extern void irq_work_single(void *); + /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * @@ -241,9 +235,21 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) - pr_warn("IPI callback %pS sent to offline CPU\n", - csd->func); + llist_for_each_entry(csd, entry, llist) { + switch (CSD_TYPE(csd)) { + case CSD_TYPE_ASYNC: + case CSD_TYPE_SYNC: + case CSD_TYPE_IRQ_WORK: + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + break; + + default: + pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", + CSD_TYPE(csd)); + break; + } + } } /* @@ -251,16 +257,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; - /* Do we wait until *after* callback? */ - if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + if (prev) { prev->next = &csd_next->llist; } else { entry = &csd_next->llist; } + func(info); csd_unlock(csd); } else { @@ -272,11 +279,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. */ llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; + int type = CSD_TYPE(csd); - csd_unlock(csd); - func(info); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } } } @@ -305,7 +318,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }; int this_cpu; int err; @@ -339,7 +352,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd_lock(csd); } - err = generic_exec_single(cpu, csd, func, info); + csd->func = func; + csd->info = info; + + err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); @@ -385,7 +401,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) csd->flags = CSD_FLAG_LOCK; smp_wmb(); - err = generic_exec_single(cpu, csd, csd->func, csd->info); + err = generic_exec_single(cpu, csd); out: preempt_enable(); @@ -500,7 +516,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_FLAG_SYNCHRONOUS; + csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) @@ -632,6 +648,17 @@ void __init smp_init(void) { int num_nodes, num_cpus; + /* + * Ensure struct irq_work layout matches so that + * flush_smp_call_function_queue() can do horrible things. + */ + BUILD_BUG_ON(offsetof(struct irq_work, llnode) != + offsetof(struct __call_single_data, llist)); + BUILD_BUG_ON(offsetof(struct irq_work, func) != + offsetof(struct __call_single_data, func)); + BUILD_BUG_ON(offsetof(struct irq_work, flags) != + offsetof(struct __call_single_data, flags)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From 126c2092e5c8b28623cb890cd2930aa292410676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:03 +0200 Subject: sched: Add rq::ttwu_pending In preparation of removing rq->wake_list, replace the !list_empty(rq->wake_list) with rq->ttwu_pending. This is not fully equivalent as this new variable is racy. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.070399698@infradead.org --- kernel/sched/core.c | 13 +++++++++++-- kernel/sched/debug.c | 1 - kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 +++- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa0d4990618e..b71ed5ee97fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2275,13 +2275,21 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) void sched_ttwu_pending(void) { struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); + struct llist_node *llist; struct task_struct *p, *t; struct rq_flags rf; + llist = llist_del_all(&rq->wake_list); if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -2318,6 +2326,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + WRITE_ONCE(rq->ttwu_pending, 1); if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_call_function_single_async(cpu, &rq->wake_csd); @@ -4705,7 +4714,7 @@ int idle_cpu(int cpu) return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1c24a6bbdae2..36c54265bb2b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -638,7 +638,6 @@ do { \ P(nr_running); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2890bd54a088..0ed04d2a8959 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8590,7 +8590,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p) */ #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75b062999c43..c86fc94c54e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -895,7 +895,9 @@ struct rq { atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned long nr_load_updates; +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3 From a148866489fbe243c936fe43e4525d8dbfa0318f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:04 +0200 Subject: sched: Replace rq::wake_list The recent commit: 90b5363acd47 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in ttwu_queue_remote() got this wrong. While on first reading ttwu_queue_remote() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. The actual race is vs the sched_ttwu_pending() call in the idle loop; that can run the wakeup-list without consuming the CSD. Instead of trying to chain the lists, merge them. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.129371594@infradead.org --- kernel/sched/core.c | 25 +++++++------------------ kernel/sched/idle.c | 1 - kernel/sched/sched.h | 8 -------- kernel/smp.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 47 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b71ed5ee97fd..b3c64c6b4d2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1538,7 +1538,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -2272,14 +2272,13 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist; struct task_struct *p, *t; struct rq_flags rf; - llist = llist_del_all(&rq->wake_list); if (!llist) return; @@ -2299,11 +2298,6 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -static void wake_csd_func(void *info) -{ - sched_ttwu_pending(); -} - void send_call_function_single_ipi(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -2327,12 +2321,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - if (llist_add(&p->wake_entry, &rq->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_call_function_single_async(cpu, &rq->wake_csd); - else - trace_sched_wake_idle_without_ipi(cpu); - } + __smp_call_single_queue(cpu, &p->wake_entry); } void wake_up_if_idle(int cpu) @@ -2772,6 +2761,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry_type = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -6564,7 +6556,6 @@ int sched_cpu_dying(unsigned int cpu) struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6763,8 +6754,6 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; - rq_csd_init(rq, &rq->wake_csd, wake_csd_func); - INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 387fd75ee6f7..05deb81bb3e3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -294,7 +294,6 @@ static void do_idle(void) * critical section. */ flush_smp_call_function_from_idle(); - sched_ttwu_pending(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c86fc94c54e5..1d4e94c1e5fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,11 +1023,6 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - call_single_data_t wake_csd; - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -1371,8 +1366,6 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1512,7 +1505,6 @@ extern void flush_smp_call_function_from_idle(void); #else /* !CONFIG_SMP: */ static inline void flush_smp_call_function_from_idle(void) { } -static inline void sched_ttwu_pending(void) { } #endif #include "stats.h" diff --git a/kernel/smp.c b/kernel/smp.c index 856562b80794..0d61dc060b01 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -196,6 +196,7 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } +extern void sched_ttwu_pending(void *); extern void irq_work_single(void *); /** @@ -244,6 +245,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func); break; + case CSD_TYPE_TTWU: + pr_warn("IPI task-wakeup sent to offline CPU\n"); + break; + default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); @@ -275,22 +280,43 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } + if (!entry) + return; + /* * Second; run all !SYNC callbacks. */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { int type = CSD_TYPE(csd); - if (type == CSD_TYPE_ASYNC) { - smp_call_func_t func = csd->func; - void *info = csd->info; + if (type != CSD_TYPE_TTWU) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } - csd_unlock(csd); - func(info); - } else if (type == CSD_TYPE_IRQ_WORK) { - irq_work_single(csd); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } + + } else { + prev = &csd->llist; } } + + /* + * Third; only CSD_TYPE_TTWU is left, issue those. + */ + if (entry) + sched_ttwu_pending(entry); } void flush_smp_call_function_from_idle(void) @@ -659,6 +685,13 @@ void __init smp_init(void) BUILD_BUG_ON(offsetof(struct irq_work, flags) != offsetof(struct __call_single_data, flags)); + /* + * Assert the CSD_TYPE_TTWU layout is similar enough + * for task_struct to be on the @call_single_queue. + */ + BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != + offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From 1f8db4150536431b031585ecc2a6793f69245de2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2020 11:01:34 +0200 Subject: sched/headers: Split out open-coded prototypes into kernel/sched/smp.h Move the prototypes for sched_ttwu_pending() and send_call_function_single_ipi() into the newly created kernel/sched/smp.h header, to make sure they are all the same, and to architectures happy that use -Wmissing-prototypes. Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/smp.h | 9 +++++++++ kernel/smp.c | 5 +---- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/smp.h (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3c64c6b4d2e..43ba2d4a8eca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -20,6 +20,7 @@ #include "../smpboot.h" #include "pelt.h" +#include "smp.h" #define CREATE_TRACE_POINTS #include diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h new file mode 100644 index 000000000000..9620e323162c --- /dev/null +++ b/kernel/sched/smp.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal SMP callback types and methods between the scheduler + * and other internal parts of the core kernel: + */ + +extern void sched_ttwu_pending(void *arg); + +extern void send_call_function_single_ipi(int cpu); diff --git a/kernel/smp.c b/kernel/smp.c index 0d61dc060b01..4dec04f7fdc5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -22,7 +22,7 @@ #include #include "smpboot.h" - +#include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) @@ -133,8 +133,6 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); -extern void send_call_function_single_ipi(int cpu); - void __smp_call_single_queue(int cpu, struct llist_node *node) { /* @@ -196,7 +194,6 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } -extern void sched_ttwu_pending(void *); extern void irq_work_single(void *); /** -- cgit v1.2.3 From 1d0326f352bb094771df17f045bdbadff89a43e6 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 14 May 2020 02:25:55 +0200 Subject: genirq: Check irq_data_get_irq_chip() return value before use irq_data_get_irq_chip() can return NULL, however it is expected that this never happens. If a buggy driver leads to NULL being returned from irq_data_get_irq_chip(), warn about it instead of crashing the machine. Signed-off-by: Marek Vasut Signed-off-by: Thomas Gleixner To: linux-arm-kernel@lists.infradead.org --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453a8a0f4804..761911168438 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2619,6 +2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -- cgit v1.2.3 From 936f2a70f2077f64fab1dcb3eca71879e82ecd3f Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 27 May 2020 14:43:19 -0700 Subject: cgroup: add cpu.stat file to root cgroup Currently, the root cgroup does not have a cpu.stat file. Add one which is consistent with /proc/stat to capture global cpu statistics that might not fall under cgroup accounting. We haven't done this in the past because the data are already presented in /proc/stat and we didn't want to add overhead from collecting root cgroup stats when cgroups are configured, but no cgroups have been created. By keeping the data consistent with /proc/stat, I think we avoid the first problem, while improving the usability of cgroups stats. We avoid the second problem by computing the contents of cpu.stat from existing data collected for /proc/stat anyway. Signed-off-by: Boris Burkov Suggested-by: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - kernel/cgroup/rstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a016749de21..51924ebdff51 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4874,7 +4874,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 41ca996568df..b6397a186ce9 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -389,18 +389,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc); } +/* + * compute the cputime for the root cgroup by getting the per cpu data + * at a global level, then categorizing the fields in a manner consistent + * with how it is done by __cgroup_account_cputime_field for each bit of + * cpu time attributed to a cgroup. + */ +static void root_cgroup_cputime(struct task_cputime *cputime) +{ + int i; + + cputime->stime = 0; + cputime->utime = 0; + cputime->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; + } +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - cgroup_rstat_flush_release(); + struct task_cputime cputime; + + if (cgroup_parent(cgrp)) { + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, + &utime, &stime); + cgroup_rstat_flush_release(); + } else { + root_cgroup_cputime(&cputime); + usage = cputime.sum_exec_runtime; + utime = cputime.utime; + stime = cputime.stime; + } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); -- cgit v1.2.3 From d8bb65ab70f702531aaaa11d9710f9450078e295 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:32 +0200 Subject: workqueue: Use rcuwait for wq_manager_wait The workqueue code has it's internal spinlock (pool::lock) and also implicit spinlock usage in the wq_manager waitqueue. These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. Taking sleeping locks from such contexts is forbidden. pool::lock can be converted to a raw spinlock as the lock held times are short. But the workqueue manager waitqueue is handled inside of pool::lock held regions which again violates the lock nesting rules of raw and regular spinlocks. The manager waitqueue has no special requirements like custom wakeup callbacks or mass wakeups. While it does not use exclusive wait mode explicitly there is no strict requirement to queue the waiters in a particular order as there is only one waiter at a time. This allows to replace the waitqueue with rcuwait which solves the locking problem because rcuwait relies on existing locking. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7a1fc9fe6314..7c3566f8e4ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,8 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ +/* wait for manager to go away */ +static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker) pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; - wake_up(&wq_manager_wait); + rcuwait_wake_up(&manager_wait); return true; } @@ -3503,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } +/* This returns with the lock held on success (pool manager is inactive). */ +static bool wq_manager_inactive(struct worker_pool *pool) +{ + spin_lock_irq(&pool->lock); + + if (pool->flags & POOL_MANAGER_ACTIVE) { + spin_unlock_irq(&pool->lock); + return false; + } + return true; +} + /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3538,10 +3551,11 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. + * Because of how wq_manager_inactive() works, we will hold the + * spinlock after a successful wait. */ - spin_lock_irq(&pool->lock); - wait_event_lock_irq(wq_manager_wait, - !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), + TASK_UNINTERRUPTIBLE); pool->flags |= POOL_MANAGER_ACTIVE; while ((worker = first_idle_worker(pool))) -- cgit v1.2.3 From a9b8a985294debae00f6c087dfec8c384d30a3b9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 21:46:33 +0200 Subject: workqueue: Convert the pool::lock and wq_mayday_lock to raw_spinlock_t The workqueue code has it's internal spinlocks (pool::lock), which are acquired on most workqueue operations. These spinlocks are converted to 'sleeping' spinlocks on a RT-kernel. Workqueue functions can be invoked from contexts which are truly atomic even on a PREEMPT_RT enabled kernel. Taking sleeping locks from such contexts is forbidden. The pool::lock hold times are bound and the code sections are relatively short, which allows to convert pool::lock and as a consequence wq_mayday_lock to raw spinlocks which are truly spinning locks even on a PREEMPT_RT kernel. With the previous conversion of the manager waitqueue to a simple waitqueue workqueues are now fully RT compliant. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 176 ++++++++++++++++++++++++++--------------------------- 1 file changed, 88 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7c3566f8e4ca..82f85f5d81a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - spinlock_t lock; /* the pool lock */ + raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ @@ -300,7 +300,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ -static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); @@ -827,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -882,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -901,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task) if (next) wake_up_process(next->task); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -912,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task) * the scheduler to get a worker's last known identity. * * CONTEXT: - * spin_lock_irq(rq->lock) + * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during @@ -943,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task) * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { @@ -968,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -1016,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL @@ -1051,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1129,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } } @@ -1164,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { @@ -1263,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!pool) goto fail; - spin_lock(&pool->lock); + raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point @@ -1292,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); @@ -1317,7 +1317,7 @@ fail: * work_struct flags. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) @@ -1434,7 +1434,7 @@ retry: if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_pool->lock); + raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); @@ -1442,11 +1442,11 @@ retry: pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); + raw_spin_unlock(&last_pool->lock); + raw_spin_lock(&pwq->pool->lock); } } else { - spin_lock(&pwq->pool->lock); + raw_spin_lock(&pwq->pool->lock); } /* @@ -1459,7 +1459,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } @@ -1491,7 +1491,7 @@ retry: insert_work(pwq, work, worklist, work_flags); out: - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); rcu_read_unlock(); } @@ -1760,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work); * necessary. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1800,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { @@ -1938,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker_attach_to_pool(worker, pool); /* start the newly created worker */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -1961,7 +1961,7 @@ fail: * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { @@ -1987,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; @@ -2005,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t) destroy_worker(worker); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) @@ -2036,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t) struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; - spin_lock_irq(&pool->lock); - spin_lock(&wq_mayday_lock); /* for wq->maydays */ + raw_spin_lock_irq(&pool->lock); + raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -2050,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t) send_mayday(work); } - spin_unlock(&wq_mayday_lock); - spin_unlock_irq(&pool->lock); + raw_spin_unlock(&wq_mayday_lock); + raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2070,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t) * may_start_working() %true. * * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ @@ -2079,7 +2079,7 @@ __releases(&pool->lock) __acquires(&pool->lock) { restart: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2095,7 +2095,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have @@ -2118,7 +2118,7 @@ restart: * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: @@ -2157,7 +2157,7 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) @@ -2239,7 +2239,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2294,7 +2294,7 @@ __acquires(&pool->lock) */ cond_resched(); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2320,7 +2320,7 @@ __acquires(&pool->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2362,11 +2362,11 @@ static int worker_thread(void *__worker) /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); @@ -2432,7 +2432,7 @@ sleep: */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2486,7 +2486,7 @@ repeat: should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2498,11 +2498,11 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and @@ -2531,7 +2531,7 @@ repeat: * incur MAYDAY_INTERVAL delay inbetween. */ if (need_to_create_worker(pool)) { - spin_lock(&wq_mayday_lock); + raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. @@ -2540,7 +2540,7 @@ repeat: get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } - spin_unlock(&wq_mayday_lock); + raw_spin_unlock(&wq_mayday_lock); } } @@ -2558,14 +2558,14 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); } - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); @@ -2645,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, @@ -2732,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2749,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) @@ -2949,9 +2949,9 @@ reflush: for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2987,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, return false; } - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3003,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a @@ -3022,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, rcu_read_unlock(); return true; already_gone: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } @@ -3415,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, */ static int init_worker_pool(struct worker_pool *pool) { - spin_lock_init(&pool->lock); + raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; @@ -3507,10 +3507,10 @@ static void rcu_free_pool(struct rcu_head *rcu) /* This returns with the lock held on success (pool manager is inactive). */ static bool wq_manager_inactive(struct worker_pool *pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (pool->flags & POOL_MANAGER_ACTIVE) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return false; } return true; @@ -3561,7 +3561,7 @@ static void put_unbound_pool(struct worker_pool *pool) while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) @@ -3717,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) return; /* this function can be called during early boot w/ irq disabled */ - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3740,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4142,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - spin_lock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); - spin_unlock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); @@ -4373,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq) struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); @@ -4389,18 +4389,18 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); @@ -4571,10 +4571,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); @@ -4781,10 +4781,10 @@ void show_workqueue_state(void) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4798,7 +4798,7 @@ void show_workqueue_state(void) struct worker *worker; bool first = true; - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -4817,7 +4817,7 @@ void show_workqueue_state(void) } pr_cont("\n"); next_pool: - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4847,7 +4847,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) struct worker_pool *pool = worker->pool; if (pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If @@ -4861,7 +4861,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) scnprintf(buf + off, size - off, "-%s", worker->desc); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4892,7 +4892,7 @@ static void unbind_workers(int cpu) for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers @@ -4906,7 +4906,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); /* @@ -4932,9 +4932,9 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4961,7 +4961,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; @@ -5000,7 +5000,7 @@ static void rebind_workers(struct worker_pool *pool) WRITE_ONCE(worker->flags, worker_flags); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** -- cgit v1.2.3 From 4f3f4cf388f8fda7ee8ea7c6af0ff0ebb2d05fe4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:58:59 +0000 Subject: workqueue: void unneeded requeuing the pwq in rescuer thread 008847f66c3 ("workqueue: allow rescuer thread to do more work.") made the rescuer worker requeue the pwq immediately if there may be more work items which need rescuing instead of waiting for the next mayday timer expiration. Unfortunately, it checks only whether the pool needs help from rescuers, but it doesn't check whether the pwq has work items in the pool (the real reason that this rescuer can help for the pool). The patch adds the check and void unneeded requeuing. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82f85f5d81a8..6feefc65d332 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2530,7 +2530,7 @@ repeat: * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ - if (need_to_create_worker(pool)) { + if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction -- cgit v1.2.3 From b8f06b0444ec146e3ae98caac8039c77e5308ce2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 06:59:02 +0000 Subject: workqueue: remove useless unlock() and lock() in series This is no point to unlock() and then lock() the same mutex back to back. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6feefc65d332..c667ca5aed61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4403,13 +4403,11 @@ void destroy_workqueue(struct workqueue_struct *wq) raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_pool_mutex); list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); -- cgit v1.2.3 From 18644cec714aabbab173729192c7594ee57a406b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2020 21:38:36 -0700 Subject: bpf: Fix use-after-free in fmod_ret check Fix the following issue: [ 436.749342] BUG: KASAN: use-after-free in bpf_trampoline_put+0x39/0x2a0 [ 436.749995] Write of size 4 at addr ffff8881ef38b8a0 by task kworker/3:5/2243 [ 436.750712] [ 436.752677] Workqueue: events bpf_prog_free_deferred [ 436.753183] Call Trace: [ 436.756483] bpf_trampoline_put+0x39/0x2a0 [ 436.756904] bpf_prog_free_deferred+0x16d/0x3d0 [ 436.757377] process_one_work+0x94a/0x15b0 [ 436.761969] [ 436.762130] Allocated by task 2529: [ 436.763323] bpf_trampoline_lookup+0x136/0x540 [ 436.763776] bpf_check+0x2872/0xa0a8 [ 436.764144] bpf_prog_load+0xb6f/0x1350 [ 436.764539] __do_sys_bpf+0x16d7/0x3720 [ 436.765825] [ 436.765988] Freed by task 2529: [ 436.767084] kfree+0xc6/0x280 [ 436.767397] bpf_trampoline_put+0x1fd/0x2a0 [ 436.767826] bpf_check+0x6832/0xa0a8 [ 436.768197] bpf_prog_load+0xb6f/0x1350 [ 436.768594] __do_sys_bpf+0x16d7/0x3720 prog->aux->trampoline = tr should be set only when prog is valid. Otherwise prog freeing will try to put trampoline via prog->aux->trampoline, but it may not point to a valid trampoline. Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200529043839.15824-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d7ee40e2748..bfea3f9a972f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10428,22 +10428,13 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_verifier_env *env) +static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) { - struct bpf_prog *prog = env->prog; - unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; - - /* This is expected to be cleaned up in the future with the KRSI effort - * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. - */ if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; - verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); - return -EINVAL; } @@ -10654,11 +10645,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) goto out; } } + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + ret = check_attach_modify_return(prog, addr); + if (ret) + verbose(env, "%s() is not modifiable\n", + prog->aux->attach_func_name); + } + + if (ret) + goto out; tr->func.addr = (void *)addr; prog->aux->trampoline = tr; - - if (prog->expected_attach_type == BPF_MODIFY_RETURN) - ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) -- cgit v1.2.3 From 3a71dc366d4aa51a22f385445f2862231d3fda3b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 10:28:40 -0700 Subject: bpf: Fix a verifier issue when assigning 32bit reg states to 64bit ones With the latest trunk llvm (llvm 11), I hit a verifier issue for test_prog subtest test_verif_scale1. The following simplified example illustrate the issue: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... w2 = w9 /* R2_w=inv0 */ r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r2 /* R6_w=inv(id=0) */ r3 = r6 /* R3_w=inv(id=0) */ r3 += 14 /* R3_w=inv(id=0) */ if r3 > r8 goto end r5 = *(u32 *)(r6 + 0) /* R6_w=inv(id=0) */ <== error here: R6 invalid mem access 'inv' ... end: In real test_verif_scale1 code, "w9 = 0" and "w2 = w9" are in different basic blocks. In the above, after "r6 += r2", r6 becomes a scalar, which eventually caused the memory access error. The correct register state should be a pkt pointer. The inprecise register state starts at "w2 = w9". The 32bit register w9 is 0, in __reg_assign_32_into_64(), the 64bit reg->smax_value is assigned to be U32_MAX. The 64bit reg->smin_value is 0 and the 64bit register itself remains constant based on reg->var_off. In adjust_ptr_min_max_vals(), the verifier checks for a known constant, smin_val must be equal to smax_val. Since they are not equal, the verifier decides r6 is a unknown scalar, which caused later failure. The llvm10 does not have this issue as it generates different code: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r9 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r3 = r6 /* R3_w=pkt(id=0,off=0,r=0,imm=0) */ r3 += 14 /* R3_w=pkt(id=0,off=14,r=0,imm=0) */ if r3 > r8 goto end ... To fix the above issue, we can include zero in the test condition for assigning the s32_max_value and s32_min_value to their 64-bit equivalents smax_value and smin_value. Further, fix the condition to avoid doing zero extension bounds checks when s32_min_value <= 0. This could allow for the case where bounds 32-bit bounds (-1,1) get incorrectly translated to (0,1) 64-bit bounds. When in-fact the -1 min value needs to force U32_MAX bound. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159077331983.6014.5758956193749002737.stgit@john-Precision-5820-Tower --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfea3f9a972f..efe14cf24bc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1168,14 +1168,14 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) * but must be positive otherwise set to worse case bounds * and refine later from tnum. */ - if (reg->s32_min_value > 0) - reg->smin_value = reg->s32_min_value; - else - reg->smin_value = 0; - if (reg->s32_max_value > 0) + if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) reg->smax_value = reg->s32_max_value; else reg->smax_value = U32_MAX; + if (reg->s32_min_value >= 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -- cgit v1.2.3 From 6d3cf962dd1a95df868c547b090bfc4c7977f4be Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 May 2020 11:05:43 -0700 Subject: printk: Collapse shutdown types into a single dump reason To turn the KMSG_DUMP_* reasons into a more ordered list, collapse the redundant KMSG_DUMP_(RESTART|HALT|POWEROFF) reasons into KMSG_DUMP_SHUTDOWN. The current users already don't meaningfully distinguish between them, so there's no need to, as discussed here: https://lore.kernel.org/lkml/CA+CK2bAPv5u1ih5y9t5FUnTyximtFCtDYXJCpuyjOyHNOkRdqw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/20200515184434.8470-2-keescook@chromium.org/ Reviewed-by: Pavel Tatashin Reviewed-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Signed-off-by: Kees Cook --- kernel/reboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); -- cgit v1.2.3 From b1f6f161b236d0e5a9222fb8b482e65aaff13689 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:06 -0400 Subject: printk: honor the max_reason field in kmsg_dumper kmsg_dump() allows to dump kmesg buffer for various system events: oops, panic, reboot, etc. It provides an interface to register a callback call for clients, and in that callback interface there is a field "max_reason", but it was getting ignored when set to any "reason" higher than KMSG_DUMP_OOPS unless "always_kmsg_dump" was passed as kernel parameter. Allow clients to actually control their "max_reason", and keep the current behavior when "max_reason" is not set. Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-3-keescook@chromium.org/ Reviewed-by: Petr Mladek Signed-off-by: Kees Cook --- kernel/printk/printk.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..a121c2255737 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3157,12 +3157,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ -- cgit v1.2.3 From fb13cb8a0482105a415e24042209d02a684255e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 May 2020 19:36:22 -0700 Subject: printk: Introduce kmsg_dump_reason_str() The pstore subsystem already had a private version of this function. With the coming addition of the pstore/zone driver, this needs to be shared. As it really should live with printk, move it there instead. Link: https://lore.kernel.org/lkml/20200515184434.8470-4-keescook@chromium.org/ Acked-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: Pavel Tatashin Signed-off-by: Kees Cook --- kernel/printk/printk.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a121c2255737..14ca4d05d902 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3144,6 +3144,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping -- cgit v1.2.3 From 1b94b3aed367ff2cdc84d325e0aa9d7cc9e3cf2a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:19 -0500 Subject: tracing: Check state.disabled in synth event trace functions Since trace_state.disabled is set in __synth_event_trace_start() at the same time -ENOENT is returned, don't bother returning -ENOENT - just have callers check trace_state.disabled instead, and avoid the extra return val munging. Link: http://lkml.kernel.org/r/87315c3889af870e8370e82b76cf48b426d70130.1585941485.git.zanussi@kernel.org Suggested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcab11cc6833..5cfe8f998b3e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1845,7 +1845,6 @@ __synth_event_trace_start(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED) || trace_trigger_soft_disabled(file)) { trace_state->disabled = true; - ret = -ENOENT; goto out; } @@ -1907,11 +1906,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -1987,11 +1983,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -2067,16 +2060,10 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - int ret; - if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - - return ret; + return __synth_event_trace_start(file, trace_state); } EXPORT_SYMBOL_GPL(synth_event_trace_start); -- cgit v1.2.3 From 2d19bd79ae6509858582a9cade739c2e9a4fdca8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:21 -0500 Subject: tracing: Add hist_debug trace event files for histogram debugging Add a new "hist_debug" file for each trace event, which when read will dump out a bunch of internal details about the hist triggers defined on that event. This is normally off but can be enabled by saying 'y' to the new CONFIG_HIST_TRIGGERS_DEBUG config option. This is in support of the new Documentation file describing histogram internals, Documentation/trace/histogram-design.rst, which was requested by developers trying to understand the internals when extending or making use of the hist triggers for higher-level tools. The histogram-design.rst documentation refers to the hist_debug files and demonstrates their use with output in the test examples. Link: http://lkml.kernel.org/r/77914c22b0ba493d9783c53bbfbc6087d6a7e1b1.1585941485.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 23 ++++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 273 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 301 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..2aec9376825f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -847,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST If unsure, say N. +config HIST_TRIGGERS_DEBUG + bool "Hist trigger debug support" + depends on HIST_TRIGGERS + help + Add "hist_debug" file for each event, which when read will + dump out a bunch of internal details about the hist triggers + defined on that event. + + The hist_debug file serves a couple of purposes: + + - Helps developers verify that nothing is broken. + + - Provides educational information to support the details + of the hist trigger internals as described by + Documentation/trace/histogram-design.rst. + + The hist_debug output only covers the data structures + related to the histogram definitions themselves and doesn't + display the internals of map buckets or variable values of + running histograms. + + If unsure, say N. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb1d004d5f2..def769df5bf1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1661,6 +1661,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 242f59e7f17d..f6f55682d3e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2208,6 +2208,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, &event_hist_fops); +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + trace_create_file("hist_debug", 0444, file->dir, file, + &event_hist_debug_fops); #endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5cfe8f998b3e..313227da4925 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6478,6 +6478,279 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +#ifdef CONFIG_HIST_TRIGGERS_DEBUG +static void hist_field_debug_show_flags(struct seq_file *m, + unsigned long flags) +{ + seq_puts(m, " flags:\n"); + + if (flags & HIST_FIELD_FL_KEY) + seq_puts(m, " HIST_FIELD_FL_KEY\n"); + else if (flags & HIST_FIELD_FL_HITCOUNT) + seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n"); + else if (flags & HIST_FIELD_FL_VAR) + seq_puts(m, " HIST_FIELD_FL_VAR\n"); + else if (flags & HIST_FIELD_FL_VAR_REF) + seq_puts(m, " HIST_FIELD_FL_VAR_REF\n"); + else + seq_puts(m, " VAL: normal u64 value\n"); + + if (flags & HIST_FIELD_FL_ALIAS) + seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); +} + +static int hist_field_debug_show(struct seq_file *m, + struct hist_field *field, unsigned long flags) +{ + if ((field->flags & flags) != flags) { + seq_printf(m, "ERROR: bad flags - %lx\n", flags); + return -EINVAL; + } + + hist_field_debug_show_flags(m, field->flags); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + + if (field->flags & HIST_FIELD_FL_VAR) { + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + } + + if (field->flags & HIST_FIELD_FL_ALIAS) + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + + if (field->flags & HIST_FIELD_FL_VAR_REF) { + seq_printf(m, " name: %s\n", field->name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + seq_printf(m, " var.hist_data: %p\n", field->var.hist_data); + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + if (field->system) + seq_printf(m, " system: %s\n", field->system); + if (field->event_name) + seq_printf(m, " event_name: %s\n", field->event_name); + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); + + return 0; +} + +static int field_var_debug_show(struct seq_file *m, + struct field_var *field_var, unsigned int i, + bool save_vars) +{ + const char *vars_name = save_vars ? "save_vars" : "field_vars"; + struct hist_field *field; + int ret = 0; + + seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i); + + field = field_var->var; + + seq_printf(m, "\n %s[%d].var:\n", vars_name, i); + + hist_field_debug_show_flags(m, field->flags); + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + + field = field_var->val; + + seq_printf(m, "\n %s[%d].val:\n", vars_name, i); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + else { + ret = -EINVAL; + goto out; + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); +out: + return ret; +} + +static int hist_action_debug_show(struct seq_file *m, + struct action_data *data, int i) +{ + int ret = 0; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i); + ret = hist_field_debug_show(m, data->track_data.var_ref, + HIST_FIELD_FL_VAR_REF); + if (ret) + goto out; + + seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i); + ret = hist_field_debug_show(m, data->track_data.track_var, + HIST_FIELD_FL_VAR); + if (ret) + goto out; + } + + if (data->handler == HANDLER_ONMATCH) { + seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n", + i, data->match_data.event_system); + seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n", + i, data->match_data.event); + } +out: + return ret; +} + +static int hist_actions_debug_show(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + int i, ret = 0; + + if (hist_data->n_actions) + seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n"); + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *action = hist_data->actions[i]; + + ret = hist_action_debug_show(m, action, i); + if (ret) + goto out; + } + + if (hist_data->n_save_vars) + seq_puts(m, "\n save action variables (save() params):\n"); + + for (i = 0; i < hist_data->n_save_vars; i++) { + ret = field_var_debug_show(m, hist_data->save_vars[i], i, true); + if (ret) + goto out; + } +out: + return ret; +} + +static void hist_trigger_debug_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int i, ret; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + + seq_printf(m, "hist_data: %p\n\n", hist_data); + seq_printf(m, " n_vals: %u\n", hist_data->n_vals); + seq_printf(m, " n_keys: %u\n", hist_data->n_keys); + seq_printf(m, " n_fields: %u\n", hist_data->n_fields); + + seq_puts(m, "\n val fields:\n\n"); + + seq_puts(m, " hist_data->fields[0]:\n"); + ret = hist_field_debug_show(m, hist_data->fields[0], + HIST_FIELD_FL_HITCOUNT); + if (ret) + return; + + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], 0); + if (ret) + return; + } + + seq_puts(m, "\n key fields:\n"); + + for (i = hist_data->n_vals; i < hist_data->n_fields; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], + HIST_FIELD_FL_KEY); + if (ret) + return; + } + + if (hist_data->n_var_refs) + seq_puts(m, "\n variable reference fields:\n"); + + for (i = 0; i < hist_data->n_var_refs; i++) { + seq_printf(m, "\n hist_data->var_refs[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->var_refs[i], + HIST_FIELD_FL_VAR_REF); + if (ret) + return; + } + + if (hist_data->n_field_vars) + seq_puts(m, "\n field variables:\n"); + + for (i = 0; i < hist_data->n_field_vars; i++) { + ret = field_var_debug_show(m, hist_data->field_vars[i], i, false); + if (ret) + return; + } + + ret = hist_actions_debug_show(m, hist_data); + if (ret) + return; +} + +static int hist_debug_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_debug_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_debug_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return single_open(file, hist_debug_show, file); +} + +const struct file_operations event_hist_debug_fops = { + .open = event_hist_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); -- cgit v1.2.3 From 726721a51838e3983023f906580722fc83f804ee Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 28 May 2020 14:32:37 -0500 Subject: tracing: Move synthetic events to a separate file With the addition of the in-kernel synthetic event API, synthetic events are no longer specifically tied to the histogram triggers. The synthetic event code is also making trace_event_hist.c very bloated, so for those reasons, move it to a separate file, trace_events_synth.c, along with a new trace_synth.h header file. Because synthetic events are now independent from hist triggers, add a new CONFIG_SYNTH_EVENTS config option, and have CONFIG_HIST_TRIGGERS select it, and have CONFIG_SYNTH_EVENT_GEN_TEST depend on it. Link: http://lkml.kernel.org/r/4d1fa1f85ed5982706ac44844ac92451dcb04715.1590693308.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 +- kernel/trace/Makefile | 1 + kernel/trace/trace_events_hist.c | 2071 +++---------------------------------- kernel/trace/trace_events_synth.c | 1789 ++++++++++++++++++++++++++++++++ kernel/trace/trace_synth.h | 36 + 5 files changed, 1989 insertions(+), 1928 deletions(-) create mode 100644 kernel/trace/trace_events_synth.c create mode 100644 kernel/trace/trace_synth.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2aec9376825f..75407d5dc83a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -623,12 +623,30 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config SYNTH_EVENTS + bool "Synthetic trace events" + select TRACING + select DYNAMIC_EVENTS + default n + help + Synthetic events are user-defined trace events that can be + used to combine data from other trace events or in fact any + data source. Synthetic events can be generated indirectly + via the trace() action of histogram triggers or directly + by way of an in-kernel API. + + See Documentation/trace/events.rst or + Documentation/trace/histogram.rst for details and examples. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING select DYNAMIC_EVENTS + select SYNTH_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields @@ -824,7 +842,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on HIST_TRIGGERS + depends on SYNTH_EVENTS help This option creates a test module to check the base functionality of in-kernel synthetic event definition and diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..1d8aaa546412 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -72,6 +72,7 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o +obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 313227da4925..0b933546142e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,13 +19,7 @@ #include #include "tracing_map.h" -#include "trace.h" -#include "trace_dynevent.h" - -#define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 - -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#include "trace_synth.h" #define ERRORS \ C(NONE, "No error"), \ @@ -380,69 +374,6 @@ struct hist_trigger_data { unsigned int n_save_var_str; }; -static int create_synth_event(int argc, const char **argv); -static int synth_event_show(struct seq_file *m, struct dyn_event *ev); -static int synth_event_release(struct dyn_event *ev); -static bool synth_event_is_busy(struct dyn_event *ev); -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev); - -static struct dyn_event_operations synth_event_ops = { - .create = create_synth_event, - .show = synth_event_show, - .is_busy = synth_event_is_busy, - .free = synth_event_release, - .match = synth_event_match, -}; - -struct synth_field { - char *type; - char *name; - size_t size; - unsigned int offset; - bool is_signed; - bool is_string; -}; - -struct synth_event { - struct dyn_event devent; - int ref; - char *name; - struct synth_field **fields; - unsigned int n_fields; - unsigned int n_u64; - struct trace_event_class class; - struct trace_event_call call; - struct tracepoint *tp; - struct module *mod; -}; - -static bool is_synth_event(struct dyn_event *ev) -{ - return ev->ops == &synth_event_ops; -} - -static struct synth_event *to_synth_event(struct dyn_event *ev) -{ - return container_of(ev, struct synth_event, devent); -} - -static bool synth_event_is_busy(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - return event->ref != 0; -} - -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev) -{ - struct synth_event *sev = to_synth_event(ev); - - return strcmp(sev->name, event) == 0 && - (!system || strcmp(system, SYNTH_SYSTEM) == 0); -} - struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -490,1882 +421,202 @@ struct action_data { union { struct { char *event; - char *event_system; - } match_data; - - struct { - /* - * var_str contains the $-unstripped variable - * name referenced by var_ref, and used when - * printing the action. Because var_ref - * creation is deferred to create_actions(), - * we need a per-action way to save it until - * then, thus var_str. - */ - char *var_str; - - /* - * var_ref refers to the variable being - * tracked e.g onmax($var). - */ - struct hist_field *var_ref; - - /* - * track_var contains the 'invisible' tracking - * variable created to keep the current - * e.g. max value. - */ - struct hist_field *track_var; - - check_track_val_fn_t check_val; - action_fn_t save_data; - } track_data; - }; -}; - -struct track_data { - u64 track_val; - bool updated; - - unsigned int key_len; - void *key; - struct tracing_map_elt elt; - - struct action_data *action_data; - struct hist_trigger_data *hist_data; -}; - -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - -struct snapshot_context { - struct tracing_map_elt *elt; - void *key; -}; - -static void track_data_free(struct track_data *track_data) -{ - struct hist_elt_data *elt_data; - - if (!track_data) - return; - - kfree(track_data->key); - - elt_data = track_data->elt.private_data; - if (elt_data) { - kfree(elt_data->comm); - kfree(elt_data); - } - - kfree(track_data); -} - -static struct track_data *track_data_alloc(unsigned int key_len, - struct action_data *action_data, - struct hist_trigger_data *hist_data) -{ - struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); - struct hist_elt_data *elt_data; - - if (!data) - return ERR_PTR(-ENOMEM); - - data->key = kzalloc(key_len, GFP_KERNEL); - if (!data->key) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - - data->key_len = key_len; - data->action_data = action_data; - data->hist_data = hist_data; - - elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); - if (!elt_data) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - data->elt.private_data = elt_data; - - elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!elt_data->comm) { - track_data_free(data); - return ERR_PTR(-ENOMEM); - } - - return data; -} - -static char last_cmd[MAX_FILTER_STR_VAL]; -static char last_cmd_loc[MAX_FILTER_STR_VAL]; - -static int errpos(char *str) -{ - return err_pos(last_cmd, str); -} - -static void last_cmd_set(struct trace_event_file *file, char *str) -{ - const char *system = NULL, *name = NULL; - struct trace_event_call *call; - - if (!str) - return; - - strcpy(last_cmd, "hist:"); - strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); - - if (file) { - call = file->event_call; - - system = call->class->system; - if (system) { - name = trace_event_name(call); - if (!name) - system = NULL; - } - } - - if (system) - snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); -} - -static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) -{ - tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, - err_type, err_pos); -} - -static void hist_err_clear(void) -{ - last_cmd[0] = '\0'; - last_cmd_loc[0] = '\0'; -} - -struct synth_trace_event { - struct trace_entry ent; - u64 fields[]; -}; - -static int synth_event_define_fields(struct trace_event_call *call) -{ - struct synth_trace_event trace; - int offset = offsetof(typeof(trace), fields); - struct synth_event *event = call->data; - unsigned int i, size, n_u64; - char *name, *type; - bool is_signed; - int ret = 0; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - size = event->fields[i]->size; - is_signed = event->fields[i]->is_signed; - type = event->fields[i]->type; - name = event->fields[i]->name; - ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); - if (ret) - break; - - event->fields[i]->offset = n_u64; - - if (event->fields[i]->is_string) { - offset += STR_VAR_LEN_MAX; - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - offset += sizeof(u64); - n_u64++; - } - } - - event->n_u64 = n_u64; - - return ret; -} - -static bool synth_field_signed(char *type) -{ - if (str_has_prefix(type, "u")) - return false; - if (strcmp(type, "gfp_t") == 0) - return false; - - return true; -} - -static int synth_field_is_string(char *type) -{ - if (strstr(type, "char[") != NULL) - return true; - - return false; -} - -static int synth_field_string_size(char *type) -{ - char buf[4], *end, *start; - unsigned int len; - int size, err; - - start = strstr(type, "char["); - if (start == NULL) - return -EINVAL; - start += sizeof("char[") - 1; - - end = strchr(type, ']'); - if (!end || end < start) - return -EINVAL; - - len = end - start; - if (len > 3) - return -EINVAL; - - strncpy(buf, start, len); - buf[len] = '\0'; - - err = kstrtouint(buf, 0, &size); - if (err) - return err; - - if (size > STR_VAR_LEN_MAX) - return -EINVAL; - - return size; -} - -static int synth_field_size(char *type) -{ - int size = 0; - - if (strcmp(type, "s64") == 0) - size = sizeof(s64); - else if (strcmp(type, "u64") == 0) - size = sizeof(u64); - else if (strcmp(type, "s32") == 0) - size = sizeof(s32); - else if (strcmp(type, "u32") == 0) - size = sizeof(u32); - else if (strcmp(type, "s16") == 0) - size = sizeof(s16); - else if (strcmp(type, "u16") == 0) - size = sizeof(u16); - else if (strcmp(type, "s8") == 0) - size = sizeof(s8); - else if (strcmp(type, "u8") == 0) - size = sizeof(u8); - else if (strcmp(type, "char") == 0) - size = sizeof(char); - else if (strcmp(type, "unsigned char") == 0) - size = sizeof(unsigned char); - else if (strcmp(type, "int") == 0) - size = sizeof(int); - else if (strcmp(type, "unsigned int") == 0) - size = sizeof(unsigned int); - else if (strcmp(type, "long") == 0) - size = sizeof(long); - else if (strcmp(type, "unsigned long") == 0) - size = sizeof(unsigned long); - else if (strcmp(type, "pid_t") == 0) - size = sizeof(pid_t); - else if (strcmp(type, "gfp_t") == 0) - size = sizeof(gfp_t); - else if (synth_field_is_string(type)) - size = synth_field_string_size(type); - - return size; -} - -static const char *synth_field_fmt(char *type) -{ - const char *fmt = "%llu"; - - if (strcmp(type, "s64") == 0) - fmt = "%lld"; - else if (strcmp(type, "u64") == 0) - fmt = "%llu"; - else if (strcmp(type, "s32") == 0) - fmt = "%d"; - else if (strcmp(type, "u32") == 0) - fmt = "%u"; - else if (strcmp(type, "s16") == 0) - fmt = "%d"; - else if (strcmp(type, "u16") == 0) - fmt = "%u"; - else if (strcmp(type, "s8") == 0) - fmt = "%d"; - else if (strcmp(type, "u8") == 0) - fmt = "%u"; - else if (strcmp(type, "char") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned char") == 0) - fmt = "%u"; - else if (strcmp(type, "int") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned int") == 0) - fmt = "%u"; - else if (strcmp(type, "long") == 0) - fmt = "%ld"; - else if (strcmp(type, "unsigned long") == 0) - fmt = "%lu"; - else if (strcmp(type, "pid_t") == 0) - fmt = "%d"; - else if (strcmp(type, "gfp_t") == 0) - fmt = "%x"; - else if (synth_field_is_string(type)) - fmt = "%s"; - - return fmt; -} - -static void print_synth_event_num_val(struct trace_seq *s, - char *print_fmt, char *name, - int size, u64 val, char *space) -{ - switch (size) { - case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); - break; - - case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); - break; - - case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); - break; - - default: - trace_seq_printf(s, print_fmt, name, val, space); - break; - } -} - -static enum print_line_t print_synth_event(struct trace_iterator *iter, - int flags, - struct trace_event *event) -{ - struct trace_array *tr = iter->tr; - struct trace_seq *s = &iter->seq; - struct synth_trace_event *entry; - struct synth_event *se; - unsigned int i, n_u64; - char print_fmt[32]; - const char *fmt; - - entry = (struct synth_trace_event *)iter->ent; - se = container_of(event, struct synth_event, call.event); - - trace_seq_printf(s, "%s: ", se->name); - - for (i = 0, n_u64 = 0; i < se->n_fields; i++) { - if (trace_seq_has_overflowed(s)) - goto end; - - fmt = synth_field_fmt(se->fields[i]->type); - - /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) - trace_seq_printf(s, "%s ", fmt); - - snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); - - /* parameter values */ - if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; - char *space = (i == se->n_fields - 1 ? "" : " "); - - print_synth_event_num_val(s, print_fmt, - se->fields[i]->name, - se->fields[i]->size, - entry->fields[n_u64], - space); - - if (strcmp(se->fields[i]->type, "gfp_t") == 0) { - trace_seq_puts(s, " ("); - trace_print_flags_seq(s, "|", - entry->fields[n_u64], - __flags); - trace_seq_putc(s, ')'); - } - n_u64++; - } - } -end: - trace_seq_putc(s, '\n'); - - return trace_handle_return(s); -} - -static struct trace_event_functions synth_event_funcs = { - .trace = print_synth_event -}; - -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct trace_event_file *trace_file = __data; - struct synth_trace_event *entry; - struct trace_event_buffer fbuffer; - struct trace_buffer *buffer; - struct synth_event *event; - unsigned int i, n_u64, val_idx; - int fields_size = 0; - - event = trace_file->event_call->data; - - if (trace_trigger_soft_disabled(trace_file)) - return; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - - entry = trace_event_buffer_reserve(&fbuffer, trace_file, - sizeof(*entry) + fields_size); - if (!entry) - goto out; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - val_idx = var_ref_idx[i]; - if (event->fields[i]->is_string) { - char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = event->fields[i]; - u64 val = var_ref_vals[val_idx]; - - switch (field->size) { - case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; - break; - - default: - entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); -} - -static void free_synth_event_print_fmt(struct trace_event_call *call) -{ - if (call) { - kfree(call->print_fmt); - call->print_fmt = NULL; - } -} - -static int __set_synth_event_print_fmt(struct synth_event *event, - char *buf, int len) -{ - const char *fmt; - int pos = 0; - int i; - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - for (i = 0; i < event->n_fields; i++) { - fmt = synth_field_fmt(event->fields[i]->type); - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", - event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); - } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - - for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_synth_event_print_fmt(struct trace_event_call *call) -{ - struct synth_event *event = call->data; - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_synth_event_print_fmt(event, NULL, 0); - - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_synth_event_print_fmt(event, print_fmt, len + 1); - call->print_fmt = print_fmt; - - return 0; -} - -static void free_synth_field(struct synth_field *field) -{ - kfree(field->type); - kfree(field->name); - kfree(field); -} - -static struct synth_field *parse_synth_field(int argc, const char **argv, - int *consumed) -{ - struct synth_field *field; - const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; - - if (field_type[0] == ';') - field_type++; - - if (!strcmp(field_type, "unsigned")) { - if (argc < 3) - return ERR_PTR(-EINVAL); - prefix = "unsigned "; - field_type = argv[1]; - field_name = argv[2]; - *consumed = 3; - } else { - field_name = argv[1]; - *consumed = 2; - } - - field = kzalloc(sizeof(*field), GFP_KERNEL); - if (!field) - return ERR_PTR(-ENOMEM); - - len = strlen(field_name); - array = strchr(field_name, '['); - if (array) - len -= strlen(array); - else if (field_name[len - 1] == ';') - len--; - - field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } - - if (field_type[0] == ';') - field_type++; - len = strlen(field_type) + 1; - if (array) - len += strlen(array); - if (prefix) - len += strlen(prefix); - - field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; - goto free; - } - if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); - if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; - } - - field->size = synth_field_size(field->type); - if (!field->size) { - ret = -EINVAL; - goto free; - } - - if (synth_field_is_string(field->type)) - field->is_string = true; - - field->is_signed = synth_field_signed(field->type); - - out: - return field; - free: - free_synth_field(field); - field = ERR_PTR(ret); - goto out; -} - -static void free_synth_tracepoint(struct tracepoint *tp) -{ - if (!tp) - return; - - kfree(tp->name); - kfree(tp); -} - -static struct tracepoint *alloc_synth_tracepoint(char *name) -{ - struct tracepoint *tp; - - tp = kzalloc(sizeof(*tp), GFP_KERNEL); - if (!tp) - return ERR_PTR(-ENOMEM); - - tp->name = kstrdup(name, GFP_KERNEL); - if (!tp->name) { - kfree(tp); - return ERR_PTR(-ENOMEM); - } - - return tp; -} - -typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, - unsigned int *var_ref_idx); - -static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct tracepoint *tp = event->tp; - - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { - struct tracepoint_func *probe_func_ptr; - synth_probe_func_t probe_func; - void *__data; - - if (!(cpu_online(raw_smp_processor_id()))) - return; - - probe_func_ptr = rcu_dereference_sched((tp)->funcs); - if (probe_func_ptr) { - do { - probe_func = probe_func_ptr->func; - __data = probe_func_ptr->data; - probe_func(__data, var_ref_vals, var_ref_idx); - } while ((++probe_func_ptr)->func); - } - } -} - -static struct synth_event *find_synth_event(const char *name) -{ - struct dyn_event *pos; - struct synth_event *event; - - for_each_dyn_event(pos) { - if (!is_synth_event(pos)) - continue; - event = to_synth_event(pos); - if (strcmp(event->name, name) == 0) - return event; - } - - return NULL; -} - -static struct trace_event_fields synth_event_fields_array[] = { - { .type = TRACE_FUNCTION_TYPE, - .define_fields = synth_event_define_fields }, - {} -}; - -static int register_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret = 0; - - event->call.class = &event->class; - event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); - if (!event->class.system) { - ret = -ENOMEM; - goto out; - } - - event->tp = alloc_synth_tracepoint(event->name); - if (IS_ERR(event->tp)) { - ret = PTR_ERR(event->tp); - event->tp = NULL; - goto out; - } - - INIT_LIST_HEAD(&call->class->fields); - call->event.funcs = &synth_event_funcs; - call->class->fields_array = synth_event_fields_array; - - ret = register_trace_event(&call->event); - if (!ret) { - ret = -ENODEV; - goto out; - } - call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; - call->class->probe = trace_event_raw_event_synth; - call->data = event; - call->tp = event->tp; - - ret = trace_add_event_call(call); - if (ret) { - pr_warn("Failed to register synthetic event: %s\n", - trace_event_name(call)); - goto err; - } - - ret = set_synth_event_print_fmt(call); - if (ret < 0) { - trace_remove_event_call(call); - goto err; - } - out: - return ret; - err: - unregister_trace_event(&call->event); - goto out; -} - -static int unregister_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret; - - ret = trace_remove_event_call(call); - - return ret; -} - -static void free_synth_event(struct synth_event *event) -{ - unsigned int i; - - if (!event) - return; - - for (i = 0; i < event->n_fields; i++) - free_synth_field(event->fields[i]); - - kfree(event->fields); - kfree(event->name); - kfree(event->class.system); - free_synth_tracepoint(event->tp); - free_synth_event_print_fmt(&event->call); - kfree(event); -} - -static struct synth_event *alloc_synth_event(const char *name, int n_fields, - struct synth_field **fields) -{ - struct synth_event *event; - unsigned int i; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) { - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->name = kstrdup(name, GFP_KERNEL); - if (!event->name) { - kfree(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); - if (!event->fields) { - free_synth_event(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - dyn_event_init(&event->devent, &synth_event_ops); - - for (i = 0; i < n_fields; i++) - event->fields[i] = fields[i]; - - event->n_fields = n_fields; - out: - return event; -} - -static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, void *key, - struct action_data *data, u64 *var_ref_vals) -{ - struct synth_event *event = data->synth_event; - - trace_synth(event, var_ref_vals, data->var_ref_idx); -} - -struct hist_var_data { - struct list_head list; - struct hist_trigger_data *hist_data; -}; - -static int synth_event_check_arg_fn(void *data) -{ - struct dynevent_arg_pair *arg_pair = data; - int size; - - size = synth_field_size((char *)arg_pair->lhs); - - return size ? 0 : -EINVAL; -} - -/** - * synth_event_add_field - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type: The type of the new field to add - * @name: The name of the new field to add - * - * Add a new field to a synthetic event cmd object. Field ordering is in - * the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, - const char *name) -{ - struct dynevent_arg_pair arg_pair; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type || !name) - return -EINVAL; - - dynevent_arg_pair_init(&arg_pair, 0, ';'); - - arg_pair.lhs = type; - arg_pair.rhs = name; - - ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field); - -/** - * synth_event_add_field_str - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type_name: The type and name of the new field to add, as a single string - * - * Add a new field to a synthetic event cmd object, as a single - * string. The @type_name string is expected to be of the form 'type - * name', which will be appended by ';'. No sanity checking is done - - * what's passed in is assumed to already be well-formed. Field - * ordering is in the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) -{ - struct dynevent_arg arg; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type_name) - return -EINVAL; - - dynevent_arg_init(&arg, ';'); - - arg.str = type_name; - - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field_str); - -/** - * synth_event_add_fields - Add multiple fields to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Add a new set of fields to a synthetic event cmd object. The event - * fields that will be defined for the event should be passed in as an - * array of struct synth_field_desc, and the number of elements in the - * array passed in as n_fields. Field ordering will retain the - * ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_fields(struct dynevent_cmd *cmd, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - unsigned int i; - int ret = 0; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_fields); - -/** - * __synth_event_gen_cmd_start - Start a synthetic event command from arg list - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field - * - * NOTE: Users normally won't want to call this function directly, but - * rather use the synth_event_gen_cmd_start() wrapper, which - * automatically adds a NULL to the end of the arg list. If this - * function is used directly, make sure the last arg in the variable - * arg list is NULL. - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * There should be an even number variable args, each pair consisting - * of a type followed by a field name. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, ...) -{ - struct dynevent_arg arg; - va_list args; - int ret; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - va_start(args, mod); - for (;;) { - const char *type, *name; - - type = va_arg(args, const char *); - if (!type) - break; - name = va_arg(args, const char *); - if (!name) - break; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, type, name); - if (ret) - break; - } - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); - -/** - * synth_event_gen_cmd_array_start - Start synthetic event command from an array - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * The event fields that will be defined for the event should be - * passed in as an array of struct synth_field_desc, and the number of - * elements in the array passed in as n_fields. Field ordering will - * retain the ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - struct dynevent_arg arg; - unsigned int i; - int ret = 0; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (n_fields > SYNTH_FIELDS_MAX) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) - return -EINVAL; - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); - -static int __create_synth_event(int argc, const char *name, const char **argv) -{ - struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; - struct synth_event *event = NULL; - int i, consumed = 0, n_fields = 0, ret = 0; - - /* - * Argument syntax: - * - Add synthetic event: field[;field] ... - * - Remove synthetic event: ! field[;field] ... - * where 'field' = type field_name - */ - - if (name[0] == '\0' || argc < 1) - return -EINVAL; - - mutex_lock(&event_mutex); - - event = find_synth_event(name); - if (event) { - ret = -EEXIST; - goto out; - } - - for (i = 0; i < argc - 1; i++) { - if (strcmp(argv[i], ";") == 0) - continue; - if (n_fields == SYNTH_FIELDS_MAX) { - ret = -EINVAL; - goto err; - } - - field = parse_synth_field(argc - i, &argv[i], &consumed); - if (IS_ERR(field)) { - ret = PTR_ERR(field); - goto err; - } - fields[n_fields++] = field; - i += consumed - 1; - } - - if (i < argc && strcmp(argv[i], ";") != 0) { - ret = -EINVAL; - goto err; - } - - event = alloc_synth_event(name, n_fields, fields); - if (IS_ERR(event)) { - ret = PTR_ERR(event); - event = NULL; - goto err; - } - ret = register_synth_event(event); - if (!ret) - dyn_event_add(&event->devent); - else - free_synth_event(event); - out: - mutex_unlock(&event_mutex); - - return ret; - err: - for (i = 0; i < n_fields; i++) - free_synth_field(fields[i]); - - goto out; -} - -/** - * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * @mod: The module creating the event, NULL if not created from a module - * - * Create a new synthetic event with the given name under the - * trace/events/synthetic/ directory. The event fields that will be - * defined for the event should be passed in as an array of struct - * synth_field_desc, and the number elements in the array passed in as - * n_fields. Field ordering will retain the ordering given in the - * fields array. - * - * If the new synthetic event is being created from a module, the mod - * param must be non-NULL. This will ensure that the trace buffer - * won't contain unreadable events. - * - * The new synth event should be deleted using synth_event_delete() - * function. The new synthetic event can be generated from modules or - * other kernel code using trace_synth_event() and related functions. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_create(const char *name, struct synth_field_desc *fields, - unsigned int n_fields, struct module *mod) -{ - struct dynevent_cmd cmd; - char *buf; - int ret; - - buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); - - ret = synth_event_gen_cmd_array_start(&cmd, name, mod, - fields, n_fields); - if (ret) - goto out; - - ret = synth_event_gen_cmd_end(&cmd); - out: - kfree(buf); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_create); - -static int destroy_synth_event(struct synth_event *se) -{ - int ret; - - if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } - } - - return ret; -} - -/** - * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event - * - * Delete a synthetic event that was created with synth_event_create(). - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_delete(const char *event_name) -{ - struct synth_event *se = NULL; - struct module *mod = NULL; - int ret = -ENOENT; - - mutex_lock(&event_mutex); - se = find_synth_event(event_name); - if (se) { - mod = se->mod; - ret = destroy_synth_event(se); - } - mutex_unlock(&event_mutex); - - if (mod) { - mutex_lock(&trace_types_lock); - /* - * It is safest to reset the ring buffer if the module - * being unloaded registered any events that were - * used. The only worry is if a new module gets - * loaded, and takes on the same id as the events of - * this module. When printing out the buffer, traced - * events left over from this module may be passed to - * the new module events and unexpected results may - * occur. - */ - tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_delete); - -static int create_or_delete_synth_event(int argc, char **argv) -{ - const char *name = argv[0]; - int ret; - - /* trace_run_command() ensures argc != 0 */ - if (name[0] == '!') { - ret = synth_event_delete(name + 1); - return ret; - } - - ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); - return ret == -ECANCELED ? -EINVAL : ret; -} - -static int synth_event_run_command(struct dynevent_cmd *cmd) -{ - struct synth_event *se; - int ret; - - ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); - if (ret) - return ret; - - se = find_synth_event(cmd->event_name); - if (WARN_ON(!se)) - return -ENOENT; - - se->mod = cmd->private_data; - - return ret; -} - -/** - * synth_event_cmd_init - Initialize a synthetic event command object - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @buf: A pointer to the buffer used to build the command - * @maxlen: The length of the buffer passed in @buf - * - * Initialize a synthetic event command object. Use this before - * calling any of the other dyenvent_cmd functions. - */ -void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) -{ - dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, - synth_event_run_command); -} -EXPORT_SYMBOL_GPL(synth_event_cmd_init); - -static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - int entry_size, fields_size = 0; - int ret = 0; - - memset(trace_state, '\0', sizeof(*trace_state)); - - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->disabled = true; - goto out; - } - - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry_size = sizeof(*trace_state->entry) + fields_size; - trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, - file, - entry_size); - if (!trace_state->entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - } -out: - return ret; -} - -static inline void -__synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); -} - -/** - * synth_event_trace - Trace a synthetic event - * @file: The trace_event_file representing the synthetic event - * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values - * - * Trace a synthetic event using the values passed in the variable - * argument list. - * - * The argument list should be a list 'n_vals' u64 values. The number - * of vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - va_list args; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - u64 val; - - val = va_arg(args, u64); - - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - va_end(args); -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace); - -/** - * synth_event_trace_array - Trace a synthetic event from an array - * @file: The trace_event_file representing the synthetic event - * @vals: Array of values - * @n_vals: The number of values in vals - * - * Trace a synthetic event using the values passed in as 'vals'. - * - * The 'vals' array is just an array of 'n_vals' u64. The number of - * vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_array(struct trace_event_file *file, u64 *vals, - unsigned int n_vals) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - u64 val = vals[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace_array); - -/** - * synth_event_trace_start - Start piecewise synthetic event trace - * @file: The trace_event_file representing the synthetic event - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Start the trace of a synthetic event field-by-field rather than all - * at once. - * - * This function 'opens' an event trace, which means space is reserved - * for the event in the trace buffer, after which the event's - * individual field values can be set through either - * synth_event_add_next_val() or synth_event_add_val(). - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state until the event trace is - * closed (and the event finally traced) using - * synth_event_trace_end(). - * - * Note that synth_event_trace_end() must be called after all values - * have been added for each event trace, regardless of whether adding - * all field values succeeded or not. - * - * Note also that for a given event trace, all fields must be added - * using either synth_event_add_next_val() or synth_event_add_val() - * but not both together or interleaved. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; + char *event_system; + } match_data; - return __synth_event_trace_start(file, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_trace_start); + struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ + char *var_str; -static int __synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - struct synth_field *field = NULL; - struct synth_trace_event *entry; - struct synth_event *event; - int i, ret = 0; + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; - if (!trace_state) { - ret = -EINVAL; - goto out; - } + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; - /* can't mix add_next_synth_val() with add_synth_val() */ - if (field_name) { - if (trace_state->add_next) { - ret = -EINVAL; - goto out; - } - trace_state->add_name = true; - } else { - if (trace_state->add_name) { - ret = -EINVAL; - goto out; - } - trace_state->add_next = true; - } + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; + }; +}; - if (trace_state->disabled) - goto out; +struct track_data { + u64 track_val; + bool updated; - event = trace_state->event; - if (trace_state->add_name) { - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - if (strcmp(field->name, field_name) == 0) - break; - } - if (!field) { - ret = -EINVAL; - goto out; - } - } else { - if (trace_state->cur_field >= event->n_fields) { - ret = -EINVAL; - goto out; - } - field = event->fields[trace_state->cur_field++]; - } + unsigned int key_len; + void *key; + struct tracing_map_elt elt; - entry = trace_state->entry; - if (field->is_string) { - char *str_val = (char *)(long)val; - char *str_field; + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; - if (!str_val) { - ret = -EINVAL; - goto out; - } +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; - str_field = (char *)&entry->fields[field->offset]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else { - switch (field->size) { - case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; - break; +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; - case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; - break; +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; - case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; - break; + if (!track_data) + return; - default: - trace_state->entry->fields[field->offset] = val; - break; - } + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); } - out: - return ret; -} -/** - * synth_event_add_next_val - Add the next field's value to an open synth trace - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the next field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function assumes all the fields in an event are to be set one - * after another - successive calls to this function are made, one for - * each field, in the order of the fields in the event, until all - * fields have been set. If you'd rather set each field individually - * without regard to ordering, synth_event_add_val() can be used - * instead. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_next_val(u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(NULL, val, trace_state); + kfree(track_data); } -EXPORT_SYMBOL_GPL(synth_event_add_next_val); -/** - * synth_event_add_val - Add a named field's value to an open synth trace - * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the named field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function looks up the field name, and if found, sets the field - * to the specified value. This lookup makes this function more - * expensive than synth_event_add_next_val(), so use that or the - * none-piecewise synth_event_trace() instead if efficiency is more - * important. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) { - return __synth_event_add_val(field_name, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_val); + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; -/** - * synth_event_trace_end - End piecewise synthetic event trace - * @trace_state: A pointer to object tracking the piecewise trace state - * - * End the trace of a synthetic event opened by - * synth_event_trace__start(). - * - * This function 'closes' an event trace, which basically means that - * it commits the reserved event and cleans up other loose ends. - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state opened with - * synth_event_trace_start(). - * - * Note that this function must be called after all values have been - * added for each event trace, regardless of whether adding all field - * values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; + if (!data) + return ERR_PTR(-ENOMEM); - __synth_event_trace_end(trace_state); + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - return 0; -} -EXPORT_SYMBOL_GPL(synth_event_trace_end); + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; -static int create_synth_event(int argc, const char **argv) -{ - const char *name = argv[0]; - int len; + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; + data->elt.private_data = elt_data; - /* This interface accepts group name prefix */ - if (strchr(name, '/')) { - len = str_has_prefix(name, SYNTH_SYSTEM "/"); - if (len == 0) - return -EINVAL; - name += len; + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); } - return __create_synth_event(argc - 1, name, argv + 1); -} - -static int synth_event_release(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - int ret; - if (event->ref) - return -EBUSY; + return data; +} - ret = unregister_synth_event(event); - if (ret) - return ret; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; - dyn_event_remove(ev); - free_synth_event(event); - return 0; +static int errpos(char *str) +{ + return err_pos(last_cmd, str); } -static int __synth_event_show(struct seq_file *m, struct synth_event *event) +static void last_cmd_set(struct trace_event_file *file, char *str) { - struct synth_field *field; - unsigned int i; + const char *system = NULL, *name = NULL; + struct trace_event_call *call; - seq_printf(m, "%s\t", event->name); + if (!str) + return; - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; + strcpy(last_cmd, "hist:"); + strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); - /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, - i == event->n_fields - 1 ? "" : "; "); + if (file) { + call = file->event_call; + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } } - seq_putc(m, '\n'); - - return 0; + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - struct synth_event *event = to_synth_event(ev); - - seq_printf(m, "s:%s/", event->class.system); - - return __synth_event_show(m, event); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } -static int synth_events_seq_show(struct seq_file *m, void *v) +static void hist_err_clear(void) { - struct dyn_event *ev = v; - - if (!is_synth_event(ev)) - return 0; - - return __synth_event_show(m, to_synth_event(ev)); + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } -static const struct seq_operations synth_events_seq_op = { - .start = dyn_event_seq_start, - .next = dyn_event_seq_next, - .stop = dyn_event_seq_stop, - .show = synth_events_seq_show, -}; +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int *var_ref_idx); -static int synth_events_open(struct inode *inode, struct file *file) +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int *var_ref_idx) { - int ret; + struct tracepoint *tp = event->tp; - ret = security_locked_down(LOCKDOWN_TRACEFS); - if (ret) - return ret; + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = dyn_events_release_all(&synth_event_ops); - if (ret < 0) - return ret; - } + if (!(cpu_online(raw_smp_processor_id()))) + return; - return seq_open(file, &synth_events_seq_op); + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } } -static ssize_t synth_events_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) +static void action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - return trace_parse_run_command(file, buffer, count, ppos, - create_or_delete_synth_event); + struct synth_event *event = data->synth_event; + + trace_synth(event, var_ref_vals, data->var_ref_idx); } -static const struct file_operations synth_events_fops = { - .open = synth_events_open, - .write = synth_events_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; }; static u64 hist_field_timestamp(struct hist_field *hist_field, @@ -7653,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } - -static __init int trace_events_hist_init(void) -{ - struct dentry *entry = NULL; - struct dentry *d_tracer; - int err = 0; - - err = dyn_event_register(&synth_event_ops); - if (err) { - pr_warn("Could not register synth_event_ops\n"); - return err; - } - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); - goto err; - } - - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, - NULL, &synth_events_fops); - if (!entry) { - err = -ENODEV; - goto err; - } - - return err; - err: - pr_warn("Could not create tracefs 'synthetic_events' entry\n"); - - return err; -} - -fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c new file mode 100644 index 000000000000..c6cca0d1d584 --- /dev/null +++ b/kernel/trace/trace_events_synth.c @@ -0,0 +1,1789 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_synth - synthetic trace events + * + * Copyright (C) 2015, 2020 Tom Zanussi + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* for gfp flag names */ +#include +#include + +#include "trace_synth.h" + +static int create_synth_event(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = create_synth_event, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + event->fields[i]->offset = n_u64; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (str_has_prefix(type, "u")) + return false; + if (strcmp(type, "gfp_t") == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += sizeof("char[") - 1; + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); + + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64, val_idx; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[val_idx]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[val_idx]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(int argc, const char **argv, + int *consumed) +{ + struct synth_field *field; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + int len, ret = 0; + + if (field_type[0] == ';') + field_type++; + + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_name); + array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; + if (array) + len += strlen(array); + if (prefix) + len += strlen(prefix); + + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + if (prefix) + strcat(field->type, prefix); + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +struct synth_event *find_synth_event(const char *name) +{ + struct dyn_event *pos; + struct synth_event *event; + + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->fields_array = synth_event_fields_array; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(const char *name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + dyn_event_init(&event->devent, &synth_event_ops); + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + + return size ? 0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + +static int __create_synth_event(int argc, const char *name, const char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + int i, consumed = 0, n_fields = 0, ret = 0; + + /* + * Argument syntax: + * - Add synthetic event: field[;field] ... + * - Remove synthetic event: ! field[;field] ... + * where 'field' = type field_name + */ + + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + + event = find_synth_event(name); + if (event) { + ret = -EEXIST; + goto out; + } + + for (i = 0; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argc - i, &argv[i], &consumed); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields++] = field; + i += consumed - 1; + } + + if (i < argc && strcmp(argv[i], ";") != 0) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + ret = register_synth_event(event); + if (!ret) + dyn_event_add(&event->devent); + else + free_synth_event(event); + out: + mutex_unlock(&event_mutex); + + return ret; + err: + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + + goto out; +} + +/** + * synth_event_create - Create a new synthetic event + * @name: The name of the new sythetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * @mod: The module creating the event, NULL if not created from a module + * + * Create a new synthetic event with the given name under the + * trace/events/synthetic/ directory. The event fields that will be + * defined for the event should be passed in as an array of struct + * synth_field_desc, and the number elements in the array passed in as + * n_fields. Field ordering will retain the ordering given in the + * fields array. + * + * If the new synthetic event is being created from a module, the mod + * param must be non-NULL. This will ensure that the trace buffer + * won't contain unreadable events. + * + * The new synth event should be deleted using synth_event_delete() + * function. The new synthetic event can be generated from modules or + * other kernel code using trace_synth_event() and related functions. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_create(const char *name, struct synth_field_desc *fields, + unsigned int n_fields, struct module *mod) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + ret = synth_event_gen_cmd_array_start(&cmd, name, mod, + fields, n_fields); + if (ret) + goto out; + + ret = synth_event_gen_cmd_end(&cmd); + out: + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_create); + +static int destroy_synth_event(struct synth_event *se) +{ + int ret; + + if (se->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); + } + } + + return ret; +} + +/** + * synth_event_delete - Delete a synthetic event + * @event_name: The name of the new sythetic event + * + * Delete a synthetic event that was created with synth_event_create(). + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + struct module *mod = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) { + mod = se->mod; + ret = destroy_synth_event(se); + } + mutex_unlock(&event_mutex); + + if (mod) { + mutex_lock(&trace_types_lock); + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + mutex_unlock(&trace_types_lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + +static int create_or_delete_synth_event(int argc, char **argv) +{ + const char *name = argv[0]; + int ret; + + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + ret = synth_event_delete(name + 1); + return ret; + } + + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int synth_event_run_command(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. + */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_command); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + va_list args; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + va_end(args); +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. + * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret; + + if (!trace_state) + return -EINVAL; + + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (trace_state->disabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + + entry = trace_state->entry; + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (!str_val) { + ret = -EINVAL; + goto out; + } + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } + out: + return ret; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_next_val(u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(NULL, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_next_val); + +/** + * synth_event_add_val - Add a named field's value to an open synth trace + * @field_name: The name of the synthetic event field value to set + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the named field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function looks up the field name, and if found, sets the field + * to the specified value. This lookup makes this function more + * expensive than synth_event_add_next_val(), so use that or the + * none-piecewise synth_event_trace() instead if efficiency is more + * important. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(field_name, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_val); + +/** + * synth_event_trace_end - End piecewise synthetic event trace + * @trace_state: A pointer to object tracking the piecewise trace state + * + * End the trace of a synthetic event opened by + * synth_event_trace__start(). + * + * This function 'closes' an event trace, which basically means that + * it commits the reserved event and cleans up other loose ends. + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state opened with + * synth_event_trace_start(). + * + * Note that this function must be called after all values have been + * added for each event trace, regardless of whether adding all field + * values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + __synth_event_trace_end(trace_state); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + +static int create_synth_event(int argc, const char **argv) +{ + const char *name = argv[0]; + int len; + + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); +} + +static int synth_event_release(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; +} + +static int __synth_event_show(struct seq_file *m, struct synth_event *event) +{ + struct synth_field *field; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? "" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + +static const struct seq_operations synth_events_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&synth_event_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_synth_init); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h new file mode 100644 index 000000000000..ac35c45207c4 --- /dev/null +++ b/kernel/trace/trace_synth.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_SYNTH_H +#define __TRACE_SYNTH_H + +#include "trace_dynevent.h" + +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 32 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + +struct synth_field { + char *type; + char *name; + size_t size; + unsigned int offset; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct dyn_event devent; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; + struct module *mod; +}; + +extern struct synth_event *find_synth_event(const char *name); + +#endif /* __TRACE_SYNTH_H */ -- cgit v1.2.3 From c200784a08d4ea82f82a30678955b7f2c7550af4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 29 May 2020 10:46:32 -0400 Subject: tracing: Add a trace print when traceoff_on_warning is triggered When "traceoff_on_warning" is enabled and a warning happens, there can still be many trace events happening on other CPUs between the time the warning occurred and the last trace event on that same CPU. This can cause confusion in examining the trace, as it may not be obvious where the warning happened. By adding a trace print into the trace just before disabling tracing, it makes it obvious where the warning occurred, and the developer doesn't have to look at other means to see what CPU it occurred on. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..760fd102dbe2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { - if (__disable_trace_on_warning) + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); tracing_off(); + } } /** -- cgit v1.2.3 From 10cdb15759540f03d056e2f73fe26377ed7dcfda Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 1 Jun 2020 08:44:40 +0000 Subject: workqueue: use BUILD_BUG_ON() for compile time test instead of WARN_ON() Any runtime WARN_ON() has to be fixed, and BUILD_BUG_ON() can help you nitice it earlier. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c667ca5aed61..9fbe1e237563 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5917,7 +5917,7 @@ void __init workqueue_init_early(void) int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); -- cgit v1.2.3 From fe537393b5795ecbe5746eec0e16124bc998a594 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 25 May 2020 14:29:28 +0200 Subject: bpf: Fix returned error sign when link doesn't support updates System calls encode returned errors as negative values. Fix a typo that breaks this convention for bpf(LINK_UPDATE) when bpf_link doesn't support update operation. Fixes: f9d041271cf4 ("bpf: Refactor bpf_link update handling") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525122928.1164495-1-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aaa29fb6f363..d13b804ff045 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3924,7 +3924,7 @@ static int link_update(union bpf_attr *attr) if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else - ret = EINVAL; + ret = -EINVAL; out_put_progs: if (old_prog) -- cgit v1.2.3 From 0142dddcbe965450338076c486d0d757b3184352 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Tue, 26 May 2020 11:00:24 +1200 Subject: bpf: Fix spelling in comment explaining ARG1 in ___bpf_prog_run Change 'handeled' to 'handled'. Signed-off-by: Chris Packham Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525230025.14470-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c40ff4cf9880..af52ca658c73 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1543,7 +1543,7 @@ select_insn: /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is - * handeled like a helper, that is, bpf_tail_call_proto, + * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; -- cgit v1.2.3 From f470378c7562a2818b45ed11c98973f2b89eedd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:55 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with probe_* and *current_task* Often it is useful when applying policy to know something about the task. If the administrator has CAP_SYS_ADMIN rights then they can use kprobe + networking hook and link the two programs together to accomplish this. However, this is a bit clunky and also means we have to call both the network program and kprobe program when we could just use a single program and avoid passing metadata through sk_msg/skb->cb, socket, maps, etc. To accomplish this add probe_* helpers to bpf_base_func_proto programs guarded by a perfmon_capable() check. New supported helpers are the following, BPF_FUNC_get_current_task BPF_FUNC_probe_read_user BPF_FUNC_probe_read_kernel BPF_FUNC_probe_read_user_str BPF_FUNC_probe_read_kernel_str Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033905529.12355.4368381069655254932.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 24 ++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 10 +++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 886949fdcece..bb4fb634275e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,12 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -647,6 +653,24 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9531f54d0a3a..187cd6995bbb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -907,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, -- cgit v1.2.3 From 1ea0f9120c8ce105ca181b070561df5cbd6bc049 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:59 +0000 Subject: bpf: Fix map permissions check The map_lookup_and_delete_elem() function should check for both FMODE_CAN_WRITE and FMODE_CAN_READ permissions because it returns a map element to user space. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-5-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d13b804ff045..2c969a9b90d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1472,7 +1472,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } -- cgit v1.2.3 From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF could represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill. Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But then would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but so doesn't few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has an advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward, as would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with hashed task's tgid being a look up key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of ring buffer and has to be a power of 2 value. There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics: - variable-length records; - if there is no more space left in ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to a data inside ring buffer data area is returned, which BPF programs can use similarly to a data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes consumer ignore the record. bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because record has to be prepared in some other place first. But it allows to submit records of the length that's not known to verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly to ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have use extra per-CPU array as a temporary heap for preparing sample. bpf_ringbuf_reserve() avoid this needs completely. But in exchange, it only allows a known constant size of memory to be reserved, such that verifier can verify that BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within single BPF program invocation. Each reserved record is tracked by verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record, but forget to submit (or discard) it. bpf_ringbuf_query() helper allows to query various properties of ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer; - BPF_RB_RING_SIZE returns the size of ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical possition of consumer/producer, respectively. Returned values are momentarily snapshots of ring buffer state and could be off by the time helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics, that take into account highly-changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in ring buffer. Together with BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers, it allows BPF program a high degree of control and, e.g., more efficient batched notifications. Default self-balancing strategy, though, should be adequate for most applications and will work reliable and efficiently already. Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if BPF program was interruped by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock, in which case reservation will fail even if ring buffer is not full. The ring buffer itself internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - consumer counter shows up to which logical position consumer consumed the data; - producer counter denotes amount of data reserved by all producers. Each time a record is reserved, producer that "owns" the record will successfully advance producer counter. At that point, data is still not yet ready to be consumed, though. Each record has 8 byte header, which contains the length of reserved record, as well as two extra bits: busy bit to denote that record is still being worked on, and discard bit, which might be set at commit time if record is discarded. In the latter case, consumer is supposed to skip the record and move on to the next one. Record header also encodes record's relative offset from the beginning of ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without requiring also the pointer to ring buffer itself. Ring buffer memory location will be restored from record metadata header. This significantly simplifies verifier, as well as improving API usability. Producer counter increments are serialized under spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to consumer in the order of reservations, but only after all previous records where already committed. It is thus possible for slow producers to temporarily hold off submitted records, that were reserved later. Reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit, that significantly simplifies (and thus speeds up as well) implementation of both producers and consumers is how data area is mapped twice contiguously back-to-back in the virtual memory. This allows to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. See comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from perf ring buffer is a self-pacing notifications of new data being availability. bpf_ringbuf_commit() implementation will send a notification of new record being available after commit only if consumer has already caught up right up to the record being committed. If not, consumer still has to catch up and thus will see new data anyways without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows to achieve a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. For extreme cases, when BPF program wants more manual control of notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before considering implementing BPF ring buffer from scratch existing alternatives in kernel were evaluated, but didn't seem to meet the needs. They largely fell into few categores: - per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning SPSC queue into MPSC w/ lock would have subpar performance compared to locked reserve + lockless commit, as with BPF ring buffer. Fixed sized elements would be too limiting for BPF programs, given existing BPF programs heavily rely on variable-sized perf buffer already; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications, that didn't seem to fit well for intended use with BPF programs. [0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 12 ++ kernel/bpf/verifier.c | 195 +++++++++++++----- kernel/trace/bpf_trace.c | 10 + 6 files changed, 680 insertions(+), 50 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } -- cgit v1.2.3 From b36e62eb85215a60916f910070f6d494b4f3e73a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 May 2020 17:48:10 -0700 Subject: bpf: Use strncpy_from_unsafe_strict() in bpf_seq_printf() helper In bpf_seq_printf() helper, when user specified a "%s" in the format string, strncpy_from_unsafe() is used to read the actual string to a buffer. The string could be a format string or a string in the kernel data structure. It is really unlikely that the string will reside in the user memory. This is different from Commit b2a5212fb634 ("bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier") which still used strncpy_from_unsafe() for "%s" to preserve the old behavior. If in the future, bpf_seq_printf() indeed needs to read user memory, we can implement "%pus" format string. Based on discussion in [1], if the intent is to read kernel memory, strncpy_from_unsafe_strict() should be used. So this patch changed to use strncpy_from_unsafe_strict(). [1]: https://lore.kernel.org/bpf/20200521152301.2587579-1-hch@lst.de/T/ Fixes: 492e639f0c22 ("bpf: Add bpf_seq_printf and bpf_seq_write helpers") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200529004810.3352219-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3767d34114c0..b6c24be5ff53 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -585,9 +585,9 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, goto out; } - err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3 From 7f1c04269fe7b3293dea38ea65da4fd6614d6f80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:12 -0600 Subject: devmap: Formalize map value as a named struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'struct bpf_devmap_val' to formalize the expected values that can be passed in for a DEVMAP. Update devmap code to use the struct. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-2-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a51d9fb7a359..a1459de0914e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,12 +60,18 @@ struct xdp_dev_bulk_queue { unsigned int count; }; +/* DEVMAP values */ +struct bpf_devmap_val { + u32 ifindex; /* device index */ +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { @@ -472,18 +478,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -541,7 +544,7 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { struct bpf_dtab_netdev *dev; @@ -551,16 +554,18 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); - } + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; dev->idx = idx; dev->dtab = dtab; + dev->val.ifindex = val->ifindex; return dev; +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, @@ -568,7 +573,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -578,10 +583,13 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -609,12 +617,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -623,7 +634,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the prorgam to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. >From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: -- cgit v1.2.3 From bb2359f4dbe98e8863b4e885fc09269ef4682ec3 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 1 Jun 2020 19:28:14 +0300 Subject: bpf: Change kvfree to kfree in generic_map_lookup_batch() buf_prevkey in generic_map_lookup_batch() is allocated with kmalloc(). It's safe to free it with kfree(). Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Signed-off-by: Denis Efremov Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200601162814.17426-1-efremov@linux.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9de3540fa90c..e83b0818b529 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1399,7 +1399,7 @@ int generic_map_lookup_batch(struct bpf_map *map, buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kvfree(buf_prevkey); + kfree(buf_prevkey); return -ENOMEM; } -- cgit v1.2.3 From 1b698fa5d8ef958007c455e316aa44c37ab3c5fb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 28 May 2020 22:47:29 +0200 Subject: xdp: Rename convert_to_xdp_frame in xdp_convert_buff_to_frame In order to use standard 'xdp' prefix, rename convert_to_xdp_frame utility routine in xdp_convert_buff_to_frame and replace all the occurrences Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/6344f739be0d1a08ab2b9607584c4d5478c8c083.1590698295.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8b85bfddfac7..27595fc6da56 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -621,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, { struct xdp_frame *xdpf; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c04fb1c72f5e..854b09beb16b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -465,7 +465,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; -- cgit v1.2.3 From 958a3f2d2aff896ae2a622878e456114f4a4cd15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 31 May 2020 17:42:55 +0200 Subject: bpf: Use tracing helpers for lsm programs Currenty lsm uses bpf_tracing_func_proto helpers which do not include stack trace or perf event output. It's useful to have those for bpftrace lsm support [1]. Using tracing_prog_func_proto helpers for lsm programs. [1] https://github.com/iovisor/bpftrace/pull/1347 Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200531154255.896551-1-jolsa@kernel.org --- kernel/bpf/bpf_lsm.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b6c24be5ff53..c41186417d93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1467,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- kernel/bpf/syscall.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 kernel/bpf/net_namespace.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); -- cgit v1.2.3 From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when link exists result in -EINVAL. Attachment of multiple links of same attach type to one netns is not supported with the intention to lift the restriction when a use-case presents itself. Because of that link create returns -E2BIG when trying to create another netns link, when one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, link lives in defunct state as long there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if netns is still alive (refcnt > 0). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + 2 files changed, 245 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 0c047ecbb7bab4c1d2136f5f04bb47a66a9a12b8 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:39 +0200 Subject: bpf, cgroup: Return ENOLINK for auto-detached links on update Failure to update a bpf_link because it has been auto-detached by a dying cgroup currently results in EINVAL error, even though the arguments passed to bpf() syscall are not wrong. bpf_links attaching to netns in this case will return ENOLINK, which carries the message that the link is no longer attached to anything. Change cgroup bpf_links to do the same to keep the uAPI errors consistent. Fixes: 0c991ebc8c69 ("bpf: Implement bpf_prog replacement for an active bpf_cgroup_link") Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-6-jakub@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5c0e964105ac..fdf7836750a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -595,7 +595,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, mutex_lock(&cgroup_mutex); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) { - ret = -EINVAL; + ret = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { -- cgit v1.2.3 From 25de110d148666752dc0e0da7a0b69de31cd7098 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Jun 2020 12:08:39 +0200 Subject: irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too Some SMP platforms don't have CONFIG_IRQ_WORK defined, resulting in a link error at build time. Define a stub and clean up the prototype definitions. Reported-by: kbuild test robot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4dec04f7fdc5..c80486a7e3b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,8 +194,6 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } -extern void irq_work_single(void *); - /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * -- cgit v1.2.3 From b1350132fef7e1f0ddfd5a985d516a6ed7a329fc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 26 May 2020 14:20:06 -0700 Subject: kgdb: Don't call the deinit under spinlock When I combined kgdboc_earlycon with an inflight patch titled ("soc: qcom-geni-se: Add interconnect support to fix earlycon crash") [1] things went boom. Specifically I got a crash during the transition between kgdboc_earlycon and the main kgdboc that looked like this: Call trace: __schedule_bug+0x68/0x6c __schedule+0x75c/0x924 schedule+0x8c/0xbc schedule_timeout+0x9c/0xfc do_wait_for_common+0xd0/0x160 wait_for_completion_timeout+0x54/0x74 rpmh_write_batch+0x1fc/0x23c qcom_icc_bcm_voter_commit+0x1b4/0x388 qcom_icc_set+0x2c/0x3c apply_constraints+0x5c/0x98 icc_set_bw+0x204/0x3bc icc_put+0x30/0xf8 geni_remove_earlycon_icc_vote+0x6c/0x9c qcom_geni_serial_earlycon_exit+0x10/0x1c kgdboc_earlycon_deinit+0x38/0x58 kgdb_register_io_module+0x11c/0x194 configure_kgdboc+0x108/0x174 kgdboc_probe+0x38/0x60 platform_drv_probe+0x90/0xb0 really_probe+0x130/0x2fc ... The problem was that we were holding the "kgdb_registration_lock" while calling into code that didn't expect to be called in spinlock context. Let's slightly defer when we call the deinit code so that it's not done under spinlock. NOTE: this does mean that the "deinit" call of the old kgdb IO module is now made _after_ the init of the new IO module, but presumably that's OK. [1] https://lkml.kernel.org/r/1588919619-21355-3-git-send-email-akashast@codeaurora.org Fixes: 220995622da5 ("kgdboc: Add kgdboc_earlycon to support early kgdb using boot consoles") Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200526142001.1.I523dc33f96589cb9956f5679976d402c8cda36fa@changeid [daniel.thompson@linaro.org: Resolved merge issues by hand] Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 4d59aa907fdc..ef94e906f05a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1089,7 +1089,6 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) } pr_info("Replacing I/O driver %s with %s\n", old_dbg_io_ops->name, new_dbg_io_ops->name); - old_dbg_io_ops->deinit(); } if (new_dbg_io_ops->init) { @@ -1104,8 +1103,10 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - if (old_dbg_io_ops) + if (old_dbg_io_ops) { + old_dbg_io_ops->deinit(); return 0; + } pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); -- cgit v1.2.3 From 1b310030bb855b9b13d1c0a9feffdb54883b06ab Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 16:11:46 -0700 Subject: kdb: Cleanup math with KDB_CMD_HISTORY_COUNT From code inspection the math in handle_ctrl_cmd() looks super sketchy because it subjects -1 from cmdptr and then does a "% KDB_CMD_HISTORY_COUNT". It turns out that this code works because "cmdptr" is unsigned and KDB_CMD_HISTORY_COUNT is a nice power of 2. Let's make this a little less sketchy. This patch should be a no-op. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200507161125.1.I2cce9ac66e141230c3644b8174b6c15d4e769232@changeid Reviewed-by: Sumit Garg Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 515379cbf209..6865a0f58d38 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1108,7 +1108,8 @@ static int handle_ctrl_cmd(char *cmd) switch (*cmd) { case CTRL_P: if (cmdptr != cmd_tail) - cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) % + KDB_CMD_HISTORY_COUNT; strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: -- cgit v1.2.3 From c893de12e1ef17b581eb2cf8fc9018ec0cbd07df Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 21 May 2020 15:21:25 +0800 Subject: kdb: Remove the misfeature 'KDBFLAGS' Currently, 'KDBFLAGS' is an internal variable of kdb, it is combined by 'KDBDEBUG' and state flags. It will be shown only when 'KDBDEBUG' is set, and the user can define an environment variable named 'KDBFLAGS' too. These are puzzling indeed. After communication with Daniel, it seems that 'KDBFLAGS' is a misfeature. So let's replace 'KDBFLAGS' with 'KDBDEBUG' to just show the value we wrote into. After this modification, we can use `md4c1 kdb_flags` instead, to observe the state flags. Suggested-by: Daniel Thompson Signed-off-by: Wei Li Link: https://lore.kernel.org/r/20200521072125.21103-1-liwei391@huawei.com [daniel.thompson@linaro.org: Make kdb_flags unsigned to avoid arithmetic right shift] Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6865a0f58d38..ec190569f690 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -62,7 +62,7 @@ int kdb_grep_trailing; /* * Kernel debugger state flags */ -int kdb_flags; +unsigned int kdb_flags; /* * kdb_lock protects updates to kdb_initial_cpu. Used to @@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv) argv[2]); return 0; } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK)) | (debugflags << KDB_DEBUG_FLAG_SHIFT); return 0; @@ -2082,7 +2081,8 @@ static int kdb_env(int argc, const char **argv) } if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + kdb_printf("KDBDEBUG=0x%x\n", + (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT); return 0; } -- cgit v1.2.3 From a37b0715ddf3007734c4e2424c14bc7efcdd1190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:18 -0700 Subject: mm/writeback: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE PF_LESS_THROTTLE exists for loop-back nfsd (and a similar need in the loop block driver and callers of prctl(PR_SET_IO_FLUSHER)), where a daemon needs to write to one bdi (the final bdi) in order to free up writes queued to another bdi (the client bdi). The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty pages, so that it can still dirty pages after other processses have been throttled. The purpose of this is to avoid deadlock that happen when the PF_LESS_THROTTLE process must write for any dirty pages to be freed, but it is being thottled and cannot write. This approach was designed when all threads were blocked equally, independently on which device they were writing to, or how fast it was. Since that time the writeback algorithm has changed substantially with different threads getting different allowances based on non-trivial heuristics. This means the simple "add 25%" heuristic is no longer reliable. The important issue is not that the daemon needs a *larger* dirty page allowance, but that it needs a *private* dirty page allowance, so that dirty pages for the "client" bdi that it is helping to clear (the bdi for an NFS filesystem or loop block device etc) do not affect the throttling of the daemon writing to the "final" bdi. This patch changes the heuristic so that the task is not throttled when the bdi it is writing to has a dirty page count below below (or equal to) the free-run threshold for that bdi. This ensures it will always be able to have some pages in flight, and so will not deadlock. In a steady-state, it is expected that PF_LOCAL_THROTTLE tasks might still be throttled by global threshold, but that is acceptable as it is only the deadlock state that is interesting for this flag. This approach of "only throttle when target bdi is busy" is consistent with the other use of PF_LESS_THROTTLE in current_may_throttle(), were it causes attention to be focussed only on the target bdi. So this patch - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE, - removes the 25% bonus that that flag gives, and - If PF_LOCAL_THROTTLE is set, don't delay at all unless the global and the local free-run thresholds are exceeded. Note that previously realtime threads were treated the same as PF_LESS_THROTTLE threads. This patch does *not* change the behvaiour for real-time threads, so it is now different from the behaviour of nfsd and loop tasks. I don't know what is wanted for realtime. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Chuck Lever [nfsd] Cc: Christoph Hellwig Cc: Michal Hocko Cc: Trond Myklebust Link: http://lkml.kernel.org/r/87ftbf7gs3.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..180a2fa33f7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) -- cgit v1.2.3 From 515e5b6d90d410a3b0b433853c367936830a45a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:32 -0700 Subject: dma-mapping: use vmap insted of reimplementing it Replace the open coded instance of vmap with the actual function. In the non-contiguous (IOMMU) case this requires an extra find_vm_area, but given that this isn't a fast path function that is a small price to pay. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/dma/remap.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* -- cgit v1.2.3 From 88dca4ca5a93d2c09e5bbc6a62fbfc3af83c4fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:40 -0700 Subject: mm: remove the pgprot argument to __vmalloc The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley [hyperv] Acked-by: Gao Xiang [erofs] Acked-by: Peter Zijlstra (Intel) Acked-by: Wei Liu Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-22-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/core.c | 6 +++--- kernel/groups.c | 2 +- kernel/module.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..086618a0058f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2946,8 +2946,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; -- cgit v1.2.3 From 2b9059489c839e67ca9254913325e18cea11a980 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:53 -0700 Subject: mm: remove __vmalloc_node_flags_caller Just use __vmalloc_node instead which gets and extra argument. To be able to to use __vmalloc_node in all caller make it available outside of vmalloc and implement it in nommu.c. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-25-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..9c1cf7a87fb3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -299,9 +299,8 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags); } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, + numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) -- cgit v1.2.3 From 041de93ff86fc500aa73e5360039c95f4d31b95f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:02 -0700 Subject: mm: remove vmalloc_user_node_flags Open code it in __bpf_map_area_alloc, which is the only caller. Also clean up __bpf_map_area_alloc to have a single vmalloc call with slightly different flags instead of the current two different calls. For this to compile for the nommu case add a __vmalloc_node_range stub to nommu.c. [akpm@linux-foundation.org: fix nommu.c build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-27-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9c1cf7a87fb3..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, - numa_node, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) -- cgit v1.2.3 From 73f693c3a705756032c2863bfb37570276902d7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:36 -0700 Subject: mm: remove vmalloc_sync_(un)mappings() These functions are not needed anymore because the vmalloc and ioremap mappings are now synchronized when they are created or torn down. Remove all callers and function definitions. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-7-joro@8bytes.org Signed-off-by: Linus Torvalds --- kernel/notifier.c | 1 - kernel/trace/trace.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } -- cgit v1.2.3 From b7e4b65f3fe92abbf4a1f57987a54c820969aebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2020 00:16:31 -0400 Subject: bpf: make bpf_check_uarg_tail_zero() use check_zeroed_user() ... rather than open-coding it, and badly, at that. Acked-by: Alexei Starovoitov Signed-off-by: Al Viro --- kernel/bpf/syscall.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64783da34202..41ba746ecbc2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -67,32 +67,19 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, size_t actual_size) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - int err; + unsigned char __user *addr = uaddr + expected_size; + int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(uaddr, actual_size))) - return -EFAULT; - if (actual_size <= expected_size) return 0; - addr = uaddr + expected_size; - end = uaddr + actual_size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - - return 0; + res = check_zeroed_user(addr, actual_size - expected_size); + if (res < 0) + return res; + return res ? 0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { -- cgit v1.2.3 From 305dacf77952e6e62405f916654f38423c78ab2f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:31 -0700 Subject: padata: remove exit routine Patch series "padata: parallelize deferred page init", v3. Deferred struct page init is a bottleneck in kernel boot--the biggest for us and probably others. Optimizing it maximizes availability for large-memory systems and allows spinning up short-lived VMs as needed without having to leave them running. It also benefits bare metal machines hosting VMs that are sensitive to downtime. In projects such as VMM Fast Restart[1], where guest state is preserved across kexec reboot, it helps prevent application and network timeouts in the guests. So, multithread deferred init to take full advantage of system memory bandwidth. Extend padata, a framework that handles many parallel singlethreaded jobs, to handle multithreaded jobs as well by adding support for splitting up the work evenly, specifying a minimum amount of work that's appropriate for one helper thread to do, load balancing between helpers, and coordinating them. More documentation in patches 4 and 8. This series is the first step in a project to address other memory proportional bottlenecks in the kernel such as pmem struct page init, vfio page pinning, hugetlb fallocate, and munmap. Deferred page init doesn't require concurrency limits, resource control, or priority adjustments like these other users will because it happens during boot when the system is otherwise idle and waiting for page init to finish. This has been run on a variety of x86 systems and speeds up kernel boot by 4% to 49%, saving up to 1.6 out of 4 seconds. Patch 6 has more numbers. This patch (of 8): padata_driver_exit() is unnecessary because padata isn't built as a module and doesn't exit. padata's init routine will soon allocate memory, so getting rid of the exit function now avoids pointless code to free it. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-1-daniel.m.jordan@oracle.com Link: http://lkml.kernel.org/r/20200527173608.2885243-2-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index aae789896616..b4c7e3e38a05 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1074,10 +1074,4 @@ static __init int padata_driver_init(void) } module_init(padata_driver_init); -static __exit void padata_driver_exit(void) -{ - cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); - cpuhp_remove_multi_state(hp_online); -} -module_exit(padata_driver_exit); #endif -- cgit v1.2.3 From f1b192b117cd418bacf42a9583d7a01855a18fe5 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:35 -0700 Subject: padata: initialize earlier padata will soon initialize the system's struct pages in parallel, so it needs to be ready by page_alloc_init_late(). The error return from padata_driver_init() triggers an initcall warning, so add a warning to padata_init() to avoid silent failure. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-3-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b4c7e3e38a05..0c5f4f78f2c2 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -31,7 +31,6 @@ #include #include #include -#include #define MAX_OBJ_NUM 1000 @@ -1052,26 +1051,26 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); if (ret < 0) { cpuhp_remove_multi_state(hp_online); - return ret; + goto err; } - return 0; -} -module_init(padata_driver_init); + return; +err: + pr_warn("padata: initialization failed\n"); #endif +} -- cgit v1.2.3 From 4611ce22468895acd61fee9ac1da810d60617d9a Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:39 -0700 Subject: padata: allocate work structures for parallel jobs from a pool padata allocates per-CPU, per-instance work structs for parallel jobs. A do_parallel call assigns a job to a sequence number and hashes the number to a CPU, where the job will eventually run using the corresponding work. This approach fit with how padata used to bind a job to each CPU round-robin, makes less sense after commit bfde23ce200e6 ("padata: unbind parallel jobs from specific CPUs") because a work isn't bound to a particular CPU anymore, and isn't needed at all for multithreaded jobs because they don't have sequence numbers. Replace the per-CPU works with a preallocated pool, which allows sharing them between existing padata users and the upcoming multithreaded user. The pool will also facilitate setting NUMA-aware concurrency limits with later users. The pool is sized according to the number of possible CPUs. With this limit, MAX_OBJ_NUM no longer makes sense, so remove it. If the global pool is exhausted, a parallel job is run in the current task instead to throttle a system trying to do too much in parallel. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-4-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 118 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 0c5f4f78f2c2..d779a73d4d92 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -32,7 +32,15 @@ #include #include -#define MAX_OBJ_NUM 1000 +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); static void padata_free_pd(struct parallel_data *pd); @@ -58,30 +66,44 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. */ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data) +{ + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} - padata->parallel(padata); - } +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -105,9 +127,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -135,25 +157,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -324,8 +346,9 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + hashed_cpu); struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); @@ -416,8 +439,6 @@ static void padata_init_pqueues(struct parallel_data *pd) pqueue = per_cpu_ptr(pd->pqueue, cpu); __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } } @@ -451,7 +472,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -1053,6 +1074,7 @@ EXPORT_SYMBOL(padata_free_shell); void __init padata_init(void) { + unsigned int i, possible_cpus; #ifdef CONFIG_HOTPLUG_CPU int ret; @@ -1064,13 +1086,27 @@ void __init padata_init(void) ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - goto err; - } + if (ret < 0) + goto remove_online_state; +#endif + + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU + cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: + cpuhp_remove_multi_state(hp_online); err: - pr_warn("padata: initialization failed\n"); #endif + pr_warn("padata: initialization failed\n"); } -- cgit v1.2.3 From 004ed42638f4428e70ead59d170f3d17ff761a0f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:43 -0700 Subject: padata: add basic support for multithreaded jobs Sometimes the kernel doesn't take full advantage of system memory bandwidth, leading to a single CPU spending excessive time in initialization paths where the data scales with memory size. Multithreading naturally addresses this problem. Extend padata, a framework that handles many parallel yet singlethreaded jobs, to also handle multithreaded jobs by adding support for splitting up the work evenly, specifying a minimum amount of work that's appropriate for one helper thread to do, load balancing between helpers, and coordinating them. This is inspired by work from Pavel Tatashin and Steve Sistare. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-5-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- kernel/padata.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index d779a73d4d92..29fc5d87a4cd 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include #include #include #include @@ -32,6 +36,8 @@ #include #include +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + struct padata_work { struct work_struct pw_work; struct list_head pw_list; /* padata_free_works linkage */ @@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock); static struct padata_work *padata_works; static LIST_HEAD(padata_free_works); +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; + static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void) } static void padata_work_init(struct padata_work *pw, work_func_t work_fn, - void *data) + void *data, int flags) { - INIT_WORK(&pw->pw_work, work_fn); + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); pw->pw_data = data; } +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; + + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. */ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + static void padata_work_free(struct padata_work *pw) { lockdep_assert_held(&padata_works_lock); list_add(&pw->pw_list, &padata_free_works); } +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); + } + spin_unlock(&padata_works_lock); +} + static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_work *pw = container_of(parallel_work, struct padata_work, @@ -168,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); if (pw) { - padata_work_init(pw, padata_parallel_worker, padata); + padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); } else { /* Maximum works limit exceeded, run in the current task. */ @@ -409,6 +463,98 @@ out: return err; } +static void __init padata_mt_helper(struct work_struct *w) +{ + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; + + spin_lock(&ps->lock); + + while (job->size > 0) { + unsigned long start, size, end; + + start = job->start; + /* So end is chunk size aligned if enough work remains. */ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; + + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. + */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. + */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. */ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); +} + static void __padata_list_init(struct padata_list *pd_list) { INIT_LIST_HEAD(&pd_list->list); -- cgit v1.2.3 From 3fba69a56e16e8dcf182fe6ca77735dd65a898aa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:31 -0700 Subject: mm: memcontrol: drop @compound parameter from memcg charging API The memcg charging API carries a boolean @compound parameter that tells whether the page we're dealing with is a hugepage. mem_cgroup_commit_charge() has another boolean @lrucare that indicates whether the page needs LRU locking or not while charging. The majority of callsites know those parameters at compile time, which results in a lot of naked "false, false" argument lists. This makes for cryptic code and is a breeding ground for subtle mistakes. Thankfully, the huge page state can be inferred from the page itself and doesn't need to be passed along. This is safe because charging completes before the page is published and somebody may split it. Simplify the callsites by removing @compound, and let memcg infer the state by using hpage_nr_pages() unconditionally. That function does PageTransHuge() to identify huge pages, which also helpfully asserts that nobody passes in tail pages by accident. The following patches will introduce a new charging API, best not to carry over unnecessary weight. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Alex Shi Reviewed-by: Joonsoo Kim Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-4-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..40e7488ce467 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -169,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + &memcg); if (err) return err; } @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); @@ -189,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); + mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ -- cgit v1.2.3 From be5d0a74c62d8da43f9526a5b08cdd18e2bbc37a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:57 -0700 Subject: mm: memcontrol: switch to native NR_ANON_MAPPED counter Memcg maintains a private MEMCG_RSS counter. This divergence from the generic VM accounting means unnecessary code overhead, and creates a dependency for memcg that page->mapping is set up at the time of charging, so that page types can be told apart. Convert the generic accounting sites to mod_lruvec_page_state and friends to maintain the per-cgroup vmstat counter of NR_ANON_MAPPED. We use lock_page_memcg() to stabilize page->mem_cgroup during rmap changes, the same way we do for NR_FILE_MAPPED. With the previous patch removing MEMCG_CACHE and the private NR_SHMEM counter, this patch finally eliminates the need to have page->mapping set up at charge time. However, we need to have page->mem_cgroup set up by the time rmap runs and does the accounting, so switch the commit and the rmap callbacks around. v2: fix temporary accounting bug by switching rmap<->commit (Joonsoo) Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Hugh Dickins Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-11-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40e7488ce467..89ef81b65bcb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,8 +188,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ -- cgit v1.2.3 From 9d82c69438d0dff8809061edbcce43a5a4bcf09f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:04 -0700 Subject: mm: memcontrol: convert anon and file-thp to new mem_cgroup_charge() API With the page->mapping requirement gone from memcg, we can charge anon and file-thp pages in one single step, right after they're allocated. This removes two out of three API calls - especially the tricky commit step that needed to happen at just the right time between when the page is "set up" and when it's "published" - somewhat vague and fluid concepts that varied by page type. All we need is a freshly allocated page and a memcg context to charge. v2: prevent double charges on pre-allocated hugepages in khugepaged [hannes@cmpxchg.org: Fix crash - *hpage could be ERR_PTR instead of NULL] Link: http://lkml.kernel.org/r/20200512215813.GA487759@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Cc: Qian Cai Link: http://lkml.kernel.org/r/20200508183105.225460-13-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 89ef81b65bcb..4253c153e985 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, + false); if (err) return err; } @@ -179,16 +178,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); - mem_cgroup_commit_charge(new_page, memcg, false); page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else -- cgit v1.2.3 From d9eb1ea2bf8734afd8ec7d995270437a7242f82b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:24 -0700 Subject: mm: memcontrol: delete unused lrucare handling Swapin faults were the last event to charge pages after they had already been put on the LRU list. Now that we charge directly on swapin, the lrucare portion of the charge code is unused. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Cc: Shakeel Butt Link: http://lkml.kernel.org/r/20200508183105.225460-19-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4253c153e985..eddc8db96027 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,8 +167,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, - false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } -- cgit v1.2.3 From c843966c556d7370bb32e7319a6d164cb8c70ae2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:37 -0700 Subject: mm: allow swappiness that prefers reclaiming anon over the file workingset With the advent of fast random IO devices (SSDs, PMEM) and in-memory swap devices such as zswap, it's possible for swap to be much faster than filesystems, and for swapping to be preferable over thrashing filesystem caches. Allow setting swappiness - which defines the rough relative IO cost of cache misses between page cache and swap-backed pages - to reflect such situations by making the swap-preferred range configurable. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-4-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..7f15d292e44c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -131,6 +131,7 @@ static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; +static int two_hundred = 200; static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; @@ -1391,7 +1392,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = &two_hundred, }, #ifdef CONFIG_HUGETLB_PAGE { -- cgit v1.2.3 From 333ed74689b8fca097574124fef7fa0e3d7f79d4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Jun 2020 12:16:37 +0100 Subject: scs: Report SCS usage in bytes rather than number of entries Fix the SCS debug usage check so that we report the number of bytes used, rather than the number of entries. Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging") Reported-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Will Deacon --- kernel/scs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scs.c b/kernel/scs.c index 222a7a9ad543..5d4d9bbdec36 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -74,7 +74,7 @@ static void scs_check_usage(struct task_struct *tsk) for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; - used++; + used += sizeof(*p); } while (used > curr) { -- cgit v1.2.3 From e7ed83d6fa1a00d0f2ad0327e73d3ea9e7ea8de1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jun 2020 11:54:36 +0300 Subject: bpf: Fix an error code in check_btf_func() This code returns success if the "info_aux" allocation fails but it should return -ENOMEM. Fixes: 8c1b6e69dcc1 ("bpf: Compare BTF types of functions arguments with actual types") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200604085436.GA943001@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c7bbaac81ef..34cde841ab68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7552,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env, const struct btf *btf; void __user *urecord; u32 prev_offset = 0; - int ret = 0; + int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) -- cgit v1.2.3 From 3c61df3885e91f8737bbbbaba79b908da0e1919f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:48 -0700 Subject: kcov: cleanup debug messages Patch series "kcov: collect coverage from usb soft interrupts", v4. This patchset extends kcov to allow collecting coverage from soft interrupts and then uses the new functionality to collect coverage from USB code. This has allowed to find at least one new HID bug [1], which was recently fixed by Alan [2]. [1] https://syzkaller.appspot.com/bug?extid=09ef48aa58261464b621 [2] https://patchwork.kernel.org/patch/11283319/ Any subsystem that uses softirqs (e.g. timers) can make use of this in the future. Looking at the recent syzbot reports, an obvious candidate is the networking subsystem [3, 4, 5 and many more]. [3] https://syzkaller.appspot.com/bug?extid=522ab502c69badc66ab7 [4] https://syzkaller.appspot.com/bug?extid=57f89d05946c53dbbb31 [5] https://syzkaller.appspot.com/bug?extid=df358e65d9c1b9d3f5f4 This pach (of 7): Previous commit left a lot of excessive debug messages, clean them up. Link; http://lkml.kernel.org/r/cover.1585233617.git.andreyknvl@google.com Link; http://lkml.kernel.org/r/ab5e2885ce674ba6e04368551e51eeb6a2c11baf.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Alexander Potapenko Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/4a497134b2cf7a9d306d28e3dd2746f5446d1605.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..e6bb2b50569f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -98,6 +98,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +120,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +134,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -366,7 +363,6 @@ static void kcov_remote_reset(struct kcov *kcov) hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } @@ -553,7 +549,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +567,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -598,7 +592,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. */ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +603,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -629,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->remote_size = remote_arg->area_size; spin_lock(&kcov_remote_lock); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { spin_unlock(&kcov_remote_lock); @@ -644,8 +635,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { spin_unlock(&kcov_remote_lock); @@ -782,7 +771,6 @@ void kcov_remote_start(u64 handle) spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); spin_unlock(&kcov_remote_lock); return; } @@ -810,8 +798,6 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); - kcov_start(t, size, area, mode, sequence); } @@ -881,10 +867,8 @@ void kcov_remote_stop(void) unsigned int size = t->kcov_size; int sequence = t->kcov_sequence; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!kcov) return; - } kcov_stop(t); t->kcov = NULL; @@ -894,8 +878,6 @@ void kcov_remote_stop(void) * KCOV_DISABLE could have been called between kcov_remote_start() * and kcov_remote_stop(), hence the check. */ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); -- cgit v1.2.3 From 67b3d3cca385507c4c8b6ad97b823415e038e3c8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:51 -0700 Subject: kcov: fix potential use-after-free in kcov_remote_start If vmalloc() fails in kcov_remote_start() we'll access remote->kcov without holding kcov_remote_lock, so remote might potentially be freed at that point. Cache kcov pointer in a local variable. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/9d9134359725a965627b7e8f2652069f86f1d1fa.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/de0d3d30ff90776a2a509cc34c7c1c7521bda125.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index e6bb2b50569f..14e7208c5291 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -748,6 +748,7 @@ static const struct file_operations kcov_fops = { void kcov_remote_start(u64 handle) { struct kcov_remote *remote; + struct kcov *kcov; void *area; struct task_struct *t; unsigned int size; @@ -774,16 +775,17 @@ void kcov_remote_start(u64 handle) spin_unlock(&kcov_remote_lock); return; } + kcov = remote->kcov; /* Put in kcov_remote_stop(). */ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); + t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; + size = kcov->remote_size; + mode = kcov->mode; + sequence = kcov->sequence; area = kcov_remote_area_get(size); spin_unlock(&kcov_remote_lock); @@ -791,7 +793,7 @@ void kcov_remote_start(u64 handle) area = vmalloc(size * sizeof(unsigned long)); if (!area) { t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } -- cgit v1.2.3 From 76484b1c77242b737f8fd001d6e00af7518221f3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:55 -0700 Subject: kcov: move t->kcov assignments into kcov_start/stop Every time kcov_start/stop() is called, t->kcov is also assigned, so move the assignment into the functions. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/6644839d3567df61ade3c4b246a46cacbe4f9e11.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/82625ef3ff878f0b585763cc31d09d9b08ca37d6.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 14e7208c5291..96dbc198d166 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -309,10 +309,12 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; @@ -326,6 +328,7 @@ static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -333,7 +336,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -584,9 +586,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); @@ -778,7 +779,6 @@ void kcov_remote_start(u64 handle) kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); - t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). @@ -792,7 +792,6 @@ void kcov_remote_start(u64 handle) if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; kcov_put(kcov); return; } @@ -800,7 +799,7 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. */ *(u64 *)area = 0; - kcov_start(t, size, area, mode, sequence); + kcov_start(t, kcov, size, area, mode, sequence); } EXPORT_SYMBOL(kcov_remote_start); @@ -873,7 +872,6 @@ void kcov_remote_stop(void) return; kcov_stop(t); - t->kcov = NULL; spin_lock(&kcov->lock); /* -- cgit v1.2.3 From eeb91f9a2e3e9766ae9fd1117bd19d87538f21bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:58 -0700 Subject: kcov: move t->kcov_sequence assignment Move t->kcov_sequence assignment before assigning t->kcov_mode for consistency. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/5889efe35e0b300e69dba97216b1288d9c2428a8.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/f0283c676bab3335cb48bfe12d375a3da4719f59.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 96dbc198d166..7cd05bd1fada 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -318,10 +318,10 @@ static void kcov_start(struct task_struct *t, struct kcov *kcov, /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) -- cgit v1.2.3 From 5fe7042dc0a2e80b4633df20dcd06b93e76e3c31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:01 -0700 Subject: kcov: use t->kcov_mode as enabled indicator Currently kcov_remote_start() and kcov_remote_stop() check t->kcov to find out whether the coverage is already being collected by the current task. Use t->kcov_mode for that instead. This doesn't change the overall behavior in any way, but serves as a preparation for the following softirq coverage collection support patch. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/f70377945d1d8e6e4916cbce871a12303d6186b4.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/ee1a1dec43059da5d7664c85c1addc89c4cd58de.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 7cd05bd1fada..93b28ad2da28 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -746,26 +746,33 @@ static const struct file_operations kcov_fops = { * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; if (WARN_ON(!in_task())) return; - t = current; + /* * Check that kcov_remote_start is not called twice * nor called by user tasks (with enabled kcov). */ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(kcov_mode_enabled(mode))) return; kcov_debug("handle = %llx\n", handle); @@ -863,13 +870,20 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; - if (!kcov) + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) return; + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; kcov_stop(t); -- cgit v1.2.3 From 5ff3b30ab57da82d8db4f14662a2858cabfbc2c0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:04 -0700 Subject: kcov: collect coverage from interrupts This change extends kcov remote coverage support to allow collecting coverage from soft interrupts in addition to kernel background threads. To collect coverage from code that is executed in softirq context, a part of that code has to be annotated with kcov_remote_start/stop() in a similar way as how it is done for global kernel background threads. Then the handle used for the annotations has to be passed to the KCOV_REMOTE_ENABLE ioctl. Internally this patch adjusts the __sanitizer_cov_trace_pc() compiler inserted callback to not bail out when called from softirq context. kcov_remote_start/stop() are updated to save/restore the current per task kcov state in a per-cpu area (in case the softirq came when the kernel was already collecting coverage in task context). Coverage from softirqs is collected into pre-allocated per-cpu areas, whose size is controlled by the new CONFIG_KCOV_IRQ_AREA_SIZE. [andreyknvl@google.com: turn current->kcov_softirq into unsigned int to fix objtool warning] Link: http://lkml.kernel.org/r/841c778aa3849c5cb8c3761f56b87ce653a88671.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Greg Kroah-Hartman Cc: Marco Elver Link: http://lkml.kernel.org/r/469bd385c431d050bc38a593296eff4baae50666.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 155 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 93b28ad2da28..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -145,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -360,8 +373,9 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; @@ -370,7 +384,7 @@ static void kcov_remote_reset(struct kcov *kcov) } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -399,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -428,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. */ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -444,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -459,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -468,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -548,6 +564,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: @@ -620,17 +637,19 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } @@ -638,20 +657,22 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (remote_arg->common_handle) { if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -667,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -687,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -706,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -718,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -739,7 +761,7 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. * - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc() @@ -752,6 +774,39 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { struct task_struct *t = current; @@ -761,28 +816,42 @@ void kcov_remote_start(u64 handle) void *area; unsigned int size; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ mode = READ_ONCE(t->kcov_mode); - if (WARN_ON(kcov_mode_enabled(mode))) + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); @@ -790,12 +859,18 @@ void kcov_remote_start(u64 handle) * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = kcov->remote_size; mode = kcov->mode; sequence = kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { @@ -803,11 +878,20 @@ void kcov_remote_start(u64 handle) return; } } + + local_irq_save(flags); + /* Reset coverage size. */ *(u64 *)area = 0; + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } kcov_start(t, kcov, size, area, mode, sequence); + local_irq_restore(flags); + } EXPORT_SYMBOL(kcov_remote_start); @@ -875,31 +959,53 @@ void kcov_remote_stop(void) void *area; unsigned int size; int sequence; + unsigned long flags; + + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); mode = READ_ONCE(t->kcov_mode); barrier(); - if (!kcov_mode_enabled(mode)) + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); return; + } kcov = t->kcov; area = t->kcov_area; size = t->kcov_size; sequence = t->kcov_sequence; + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } + kcov_stop(t); + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + local_irq_restore(flags); + + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -913,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The -- cgit v1.2.3 From 3fe4f4991a2a818277445bd5b8b289305b7dd15d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:44 -0700 Subject: kexec_file: don't place kexec images on IORESOURCE_MEM_DRIVER_MANAGED Memory flagged with IORESOURCE_MEM_DRIVER_MANAGED is special - it won't be part of the initial memmap of the kexec kernel and not all memory might be accessible. Don't place any kexec images onto it. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-4-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; -- cgit v1.2.3 From de83dbd97f173650a602c5e356025b732173ecc4 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun 2020 16:49:58 -0700 Subject: user.c: make uidhash_table static Fix the following sparse warning: kernel/user.c:85:19: warning: symbol 'uidhash_table' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Cc: David Howells Cc: Greg Kroah-Hartman Cc: Rasmus Villemoes Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200413082146.22737-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is -- cgit v1.2.3 From eac2cece45074e372f78a459c7bb2d7207b72736 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 4 Jun 2020 16:51:11 -0700 Subject: kernel/kprobes.c: convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Greg KH Cc: Ingo Molnar Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-4-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..50cd84f53df0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2684,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } -- cgit v1.2.3 From 54e200ab40fc14c863bcc80a51e20b7906608fce Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 4 Jun 2020 16:51:27 -0700 Subject: kernel/relay.c: handle alloc_percpu returning NULL in relay_open alloc_percpu() may return NULL, which means chan->buf may be set to NULL. In that case, when we do *per_cpu_ptr(chan->buf, ...), we dereference an invalid pointer: BUG: Unable to handle kernel data access at 0x7dae0000 Faulting instruction address: 0xc0000000003f3fec ... NIP relay_open+0x29c/0x600 LR relay_open+0x270/0x600 Call Trace: relay_open+0x264/0x600 (unreliable) __blk_trace_setup+0x254/0x600 blk_trace_setup+0x68/0xa0 sg_ioctl+0x7bc/0x2e80 do_vfs_ioctl+0x13c/0x1300 ksys_ioctl+0x94/0x130 sys_ioctl+0x48/0xb0 system_call+0x5c/0x68 Check if alloc_percpu returns NULL. This was found by syzkaller both on x86 and powerpc, and the reproducer it found on powerpc is capable of hitting the issue as an unprivileged user. Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: syzbot+1e925b4b836afe85a1c6@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+587b2421926808309d21@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+58320b7171734bf79d26@syzkaller.appspotmail.com Reported-by: syzbot+d6074fb08bdb2e010520@syzkaller.appspotmail.com Signed-off-by: Daniel Axtens Signed-off-by: Andrew Morton Reviewed-by: Michael Ellerman Reviewed-by: Andrew Donnellan Acked-by: David Rientjes Cc: Akash Goel Cc: Andrew Donnellan Cc: Guenter Roeck Cc: Salvatore Bonaccorso Cc: [4.10+] Link: http://lkml.kernel.org/r/20191219121256.26480-1-dja@axtens.net Signed-off-by: Linus Torvalds --- kernel/relay.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..dc82705e1cff 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; -- cgit v1.2.3 From 341a7213e5c1ce274cc0f02270054905800ea660 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Thu, 4 Jun 2020 16:51:30 -0700 Subject: kernel/relay.c: fix read_pos error when multiple readers When reading, read_pos should start with bytes_consumed, not file->f_pos. Because when there is more than one reader, the read_pos corresponding to file->f_pos may have been consumed, which will cause the data that has been consumed to be read and the bytes_consumed update error. Signed-off-by: Pengcheng Yang Signed-off-by: Andrew Morton Reviewed-by: Jens Axboe Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Al Viro e Link: http://lkml.kernel.org/r/1579691175-28949-1-git-send-email-yangpc@wangsu.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index dc82705e1cff..204867220f8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -996,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1064,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1136,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; -- cgit v1.2.3 From d24de76af836260a99ca2ba281a937bd5bc55591 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jun 2020 07:14:43 +0200 Subject: block: remove the error argument to the block_bio_complete tracepoint The status can be trivially derived from the bio itself. That also avoid callers like NVMe to incorrectly pass a blk_status_t instead of the errno, and the overhead of translating the blk_status_t to the errno in the I/O completion fast path when no tracing is enabled. Fixes: 35fe0d12c8a3 ("nvme: trace bio completion") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea47f2084087..1e5499414cdf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -885,10 +885,10 @@ static void blk_add_trace_bio_bounce(void *ignore, } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) + struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, + blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, -- cgit v1.2.3 From 48bc3cd3e07a1486f45d9971c75d6090976c3b1b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:28 -0700 Subject: blktrace: use errno instead of bi_status In blk_add_trace_spliti() blk_add_trace_bio_remap() use blk_status_to_errno() to pass the error instead of pasing the bi_status. This fixes the sparse warning. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1e5499414cdf..e2013bd0e2a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -995,8 +995,10 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu, blk_trace_bio_get_cgid(q, bio)); + BLK_TA_SPLIT, + blk_status_to_errno(bio->bi_status), + sizeof(rpdu), &rpdu, + blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } @@ -1033,7 +1035,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } -- cgit v1.2.3 From 71df3fd82e7cccec7b749a8607a4662d9f7febdd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:29 -0700 Subject: blktrace: fix endianness in get_pdu_int() In function get_pdu_len() replace variable type from __u64 to __be64. This fixes sparse warning. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e2013bd0e2a6..f857b4684beb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1256,7 +1256,7 @@ static inline __u16 t_error(const struct trace_entry *ent) static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent, has_cg); + const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } -- cgit v1.2.3 From 5aec598c456fe3c1b71a1202cbb42bdc2a643277 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 4 Jun 2020 00:13:30 -0700 Subject: blktrace: fix endianness for blk_log_remap() The function blk_log_remap() can be simplified by removing the call to get_pdu_remap() that copies the values into extra variable to print the data, which also fixes the endiannness warning reported by sparse. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f857b4684beb..5773f0ba7e76 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1260,17 +1260,6 @@ static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) return be64_to_cpu(*val); } -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r, bool has_cg) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - __u64 sector_from = __r->sector_from; - - r->device_from = be32_to_cpu(__r->device_from); - r->device_to = be32_to_cpu(__r->device_to); - r->sector_from = be64_to_cpu(sector_from); -} - typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); @@ -1410,13 +1399,13 @@ static void blk_log_with_error(struct trace_seq *s, static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { - struct blk_io_trace_remap r = { .device_from = 0, }; + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); + MAJOR(be32_to_cpu(__r->device_from)), + MINOR(be32_to_cpu(__r->device_from)), + be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) -- cgit v1.2.3 From afd8d7c7f93681370f6fc2ec62f2705710eee62d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 31 May 2020 23:00:59 +0200 Subject: PM: hibernate: Add __init annotation to swsusp_header_init() 'swsusp_header_init()' is only called via 'core_initcall'. It can be marked as __init to save a few bytes of memory. Signed-off-by: Christophe JAILLET [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ca0fcb5ced71..01e2858b5fe3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1590,7 +1590,7 @@ int swsusp_unmark(void) } #endif -static int swsusp_header_init(void) +static int __init swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) -- cgit v1.2.3 From 388d8bdb87e01bcea6d0b2bf797b5f6d7b2401fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Apr 2020 11:40:34 +0100 Subject: tracing: Remove obsolete PREEMPTIRQ_EVENTS kconfig option The PREEMPTIRQ_EVENTS option is unused after commit c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage"). Remove it. Note that this option is hazardous as it stands. It enables TRACE_IRQFLAGS event on non-preempt configurations without the irqsoff tracer enabled. TRACE_IRQFLAGS as it stands incurs significant overhead on each IRQ entry/exit. This is because trace_hardirqs_[on|off] does all the per-cpu manipulations and NMI checks even if tracing is completely disabled for some insane reason. For example, netperf running UDP_STREAM on localhost incurs a 4-6% performance penalty without any tracing if IRQFLAGS is set. It can be put behind a static brach but even the function entry/exit costs a little bit. Link: https://lkml.kernel.org/r/20200409104034.GJ3818@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 75407d5dc83a..0c82ac2c5688 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -258,15 +258,6 @@ config TRACE_PREEMPT_TOGGLE Enables hooks which will be called when preemption is first disabled, and last enabled. -config PREEMPTIRQ_EVENTS - bool "Enable trace events for preempt and irq disable/enable" - select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPTION - select GENERIC_TRACER - default n - help - Enable tracing of disable and enable events for preemption and irqs. - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n -- cgit v1.2.3 From 8dfb61dcbaceb19a5ded5e9c9dcf8d05acc32294 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Jun 2020 10:39:55 +0300 Subject: kbuild: add variables for compression tools Allow user to use alternative implementations of compression tools, such as pigz, pbzip2, pxz. For example, multi-threaded tools to speed up the build: $ make GZIP=pigz BZIP2=pbzip2 Variables _GZIP, _BZIP2, _LZOP are used internally because original env vars are reserved by the tools. The use of GZIP in gzip tool is obsolete since 2015. However, alternative implementations (e.g., pigz) still rely on it. BZIP2, BZIP, LZOP vars are not obsolescent. The credit goes to @grsecurity. As a sidenote, for multi-threaded lzma, xz compression one can use: $ export XZ_OPT="--threads=0" Signed-off-by: Denis Efremov Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index e13ca842eb7e..c1510f0ab3ea 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 | find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --numeric-owner --no-recursion \ - -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null + -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From 7ff0d4490ebadae8037a079a70c5cac13385a808 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jun 2020 07:52:37 +0200 Subject: trace: fix an incorrect __user annotation on stack_trace_sysctl No user pointers for sysctls anymore. Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Reported-by: build test robot Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/trace/trace_stack.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c557f42a9397..98bba4764c52 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = { #endif /* CONFIG_DYNAMIC_FTRACE */ int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int was_enabled; int ret; -- cgit v1.2.3 From db38d5c106dfdd7cb7207c83267d82fdf4950b61 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:17 -0700 Subject: kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. [akpm@linux-foundation.org: tweak kernel-parameters.txt wording] Suggested-by: Qian Cai Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/panic.c | 34 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 11 ++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..94b5c973770c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -686,3 +693,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..587ed0494f2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); -- cgit v1.2.3 From b467f3ef3c50c4fa8926ca07f7db9a33a645e13a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:31 -0700 Subject: kernel/hung_task convert hung_task_panic boot parameter to sysctl We can now handle sysctl parameters on kernel command line and have infrastructure to convert legacy command line options that duplicate sysctl to become a sysctl alias. This patch converts the hung_task_panic parameter. Note that the sysctl handler is more strict and allows only 0 and 1, while the legacy parameter allowed any non-zero value. But there is little reason anyone would not be using 1. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-4-vbabka@suse.cz Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..b22b5eeab3cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,16 +63,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { -- cgit v1.2.3 From f117955a2255721a6a0e9cecf6cad3a6eb43cbc3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:42 -0700 Subject: kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases After a recent change introduced by Vlastimil's series [0], kernel is able now to handle sysctl parameters on kernel command line; also, the series introduced a simple infrastructure to convert legacy boot parameters (that duplicate sysctls) into sysctl aliases. This patch converts the watchdog parameters softlockup_panic and {hard,soft}lockup_all_cpu_backtrace to use the new alias infrastructure. It fixes the documentation too, since the alias only accepts values 0 or 1, not the full range of integers. We also took the opportunity here to improve the documentation of the previously converted hung_task_panic (see the patch series [0]) and put the alias table in alphabetical order. [0] http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200507214624.21911-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* -- cgit v1.2.3 From 0ec9dc9bcba0a62b0844e54c1caf6b8b0bf6b5b4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:45 -0700 Subject: kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change in that we started to show all CPUs backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a slightly expensive task, specially printing that on console (and specially in many CPU machines, as servers commonly found nowadays). So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs backtrace on dmesg/console, which will delay the panic and pollute the log (crash tool would easily grab all CPUs traces with 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs backtraces but not have the system panic when a hung task is detected. The current approach hence is almost as embedding a policy in the kernel, by forcing the CPUs backtraces' dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs backtraces dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analog to the approach taken on soft/hard lockups, that have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there's no warnings left. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b22b5eeab3cb..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -191,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f..34c1278951b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, -- cgit v1.2.3 From 60c958d8df9cfc40b745d6cd583cfbfa7525ead6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:48 -0700 Subject: panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. This patch implements a way to collect a bit more debug details when an oops event is reached, by printing all the CPUs backtraces through the usage of NMIs (on architectures that support that). The sysctl added (and documented) here was called "oops_all_cpu_backtrace", and when set will (as the name suggests) dump all CPUs backtraces. Far from ideal, this may be the last option though for users that for some reason cannot panic on oops. Most of times oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses shown in other guests CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Randy Dunlap Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- kernel/panic.c | 11 +++++++++++ kernel/sysctl.c | 11 +++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 94b5c973770c..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; @@ -522,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34c1278951b9..f69d581d39c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, -- cgit v1.2.3 From e77132e75845470065768e2205727e6be52cb7f4 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:51 -0700 Subject: kernel/sysctl.c: ignore out-of-range taint bits introduced via kernel.tainted Users with SYS_ADMIN capability can add arbitrary taint flags to the running kernel by writing to /proc/sys/kernel/tainted or issuing the command 'sysctl -w kernel.tainted=...'. This interface, however, is open for any integer value and this might cause an invalid set of flags being committed to the tainted_mask bitset. This patch introduces a simple way for proc_taint() to ignore any eventual invalid bit coming from the user input before committing those bits to the kernel tainted_mask. Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "Theodore Ts'o" Link: http://lkml.kernel.org/r/20200512223946.888020-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f69d581d39c3..db1ce7af2563 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -880,10 +880,9 @@ static int proc_taint(struct ctl_table *table, int write, * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); - } } return err; -- cgit v1.2.3 From dadbb612f6e50bbf9101c2f5d82690ce9ea4d66b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 7 Jun 2020 21:40:55 -0700 Subject: mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } -- cgit v1.2.3 From 885f7f8e30468705926b3de53d32925bd7a51a03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:22 -0700 Subject: mm: rename flush_icache_user_range to flush_icache_user_page The function currently known as flush_icache_user_range only operates on a single page. Rename it to flush_icache_user_page as we'll need the name flush_icache_user_range for something else soon. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20200515143646.3857579-20-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eddc8db96027..e51ec844c87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1668,7 +1668,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. -- cgit v1.2.3 From 490741ab1beb9c9f6580e30c7cd91b17a7560369 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:52 -0700 Subject: module: move the set_fs hack for flush_icache_range to m68k flush_icache_range generally operates on kernel addresses, but for some reason m68k needed a set_fs override. Move that into the m68k code insted of keeping it in the module loader. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Jessica Yu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Link: http://lkml.kernel.org/r/20200515143646.3857579-30-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/module.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ef400c389f49..e8a198588f26 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3344,12 +3344,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3361,8 +3355,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, -- cgit v1.2.3 From c7f3d43b629b598a2bb9ec3524e844eae7492e7e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Jun 2020 23:51:15 +0200 Subject: clocksource: Remove obsolete ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE was a transitional config switch which got removed after all architectures got converted to the new storage model. But the removal forgot to remove the #ifdef which guards the vdso_clock_mode sanity check, which effectively disables the sanity check. Remove it now. Fixes: f86fd32db706 ("lib/vdso: Cleanup clock mode storage leftovers") Signed-off-by: Thomas Gleixner Tested-by: Miklos Szeredi Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200606221531.845475036@linutronix.de --- kernel/time/clocksource.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7cb09c4cf21c..02441ead3c3b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); -#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", cs->name, cs->vdso_clock_mode); cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } -#endif /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); -- cgit v1.2.3 From 2062a4e8ae9f486847652927aaf88e21ab8d195d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:29:56 -0700 Subject: kallsyms/printk: add loglvl to print_ip_sym() Patch series "Add log level to show_stack()", v3. Add log level argument to show_stack(). Done in three stages: 1. Introducing show_stack_loglvl() for every architecture 2. Migrating old users with an explicit log level 3. Renaming show_stack_loglvl() into show_stack() Justification: - It's a design mistake to move a business-logic decision into platform realization detail. - I have currently two patches sets that would benefit from this work: Removing console_loglevel jumps in sysrq driver [1] Hung task warning before panic [2] - suggested by Tetsuo (but he probably didn't realise what it would involve). - While doing (1), (2) the backtraces were adjusted to headers and other messages for each situation - so there won't be a situation when the backtrace is printed, but the headers are missing because they have lesser log level (or the reverse). - As the result in (2) plays with console_loglevel for kdb are removed. The least important for upstream, but maybe still worth to note that every company I've worked in so far had an off-list patch to print backtrace with the needed log level (but only for the architecture they cared about). If you have other ideas how you will benefit from show_stack() with a log level - please, reply to this cover letter. See also discussion on v1: https://lore.kernel.org/linux-riscv/20191106083538.z5nlpuf64cigxigh@pathway.suse.cz/ This patch (of 50): print_ip_sym() needs to have a log level parameter to comply with other parts being printed. Otherwise, half of the expected backtrace would be printed and other may be missing with some logging level. The following callee(s) are using now the adjusted log level: - microblaze/unwind: the same level as headers & userspace unwind. Note that pr_debug()'s there are for debugging the unwinder itself. - nds32/traps: symbol addresses are printed with the same log level as backtrace headers. - lockdep: ip for locking issues is printed with the same log level as other part of the warning. - sched: ip where preemption was disabled is printed as error like the rest part of the message. - ftrace: bug reports are now consistent in the log level being used. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Steven Rostedt (VMware) Cc: Albert Ou Cc: Ben Segall Cc: Dietmar Eggemann Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Hogan Cc: Juri Lelli Cc: Mel Gorman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vincent Chen Cc: Vincent Guittot Cc: Will Deacon Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Dmitry Safonov Cc: Jiri Slaby Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Aurelien Jacquiot Cc: Mark Salter Cc: Guo Ren Cc: Yoshinori Sato Cc: Brian Cain Cc: Fenghua Yu Cc: Tony Luck Cc: Geert Uytterhoeven Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Rich Felker Cc: "David S. Miller" Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Chris Zankel Cc: Max Filippov Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Rafael J. Wysocki" Cc: Daniel Thompson Cc: Douglas Anderson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-2-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 4 ++-- kernel/sched/core.c | 6 ++---- kernel/trace/ftrace.c | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4c057dd8e93b..38cce34d03dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce..c06da3c3e317 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5765aeea698..7d0ebd104706 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2020,12 +2020,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EFAULT: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2036,12 +2036,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EPERM: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { -- cgit v1.2.3 From 77819daf247aad16beaeb537ae77d1d6d0697ca2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:19 -0700 Subject: kdb: don't play with console_loglevel Print the stack trace with KERN_EMERG - it should be always visible. Playing with console_loglevel is a bad idea as there may be more messages printed than wanted. Also the stack trace might be not printed at all if printk() was deferred and console_loglevel was raised back before the trace got flushed. Unfortunately, after rebasing on commit 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master"), kdb_show_stack() uses now kdb_dump_stack_on_cpu(), which for now won't be converted as it uses dump_stack() instead of show_stack(). Convert for now the branch that uses show_stack() and remove console_loglevel exercise from that case. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Reviewed-by: Douglas Anderson Acked-by: Daniel Thompson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-48-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bt.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 3de0cc780c16..43f5dcd2b9ac 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,17 +21,18 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { - int old_lvl = console_loglevel; - - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - if (!addr && kdb_task_has_cpu(p)) + if (!addr && kdb_task_has_cpu(p)) { + int old_lvl = console_loglevel; + + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_dump_stack_on_cpu(kdb_process_cpu(p)); - else - show_stack(p, addr); + console_loglevel = old_lvl; + } else { + show_stack_loglvl(p, addr, KERN_EMERG); + } - console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit v1.2.3 From 8ba09b1dc131ff9bb530967c593e298c600f72c0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:23 -0700 Subject: sched: print stack trace with KERN_INFO Aligning with other messages printed in sched_show_task() - use KERN_INFO to print the backtrace. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20200418201944.482088-49-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06da3c3e317..c68a6e7b306f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + show_stack_loglvl(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); -- cgit v1.2.3 From fe1993a001094a596576334c56e7a10e4cd69e6c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:26 -0700 Subject: kernel: use show_stack_loglvl() Align the last users of show_stack() by KERN_DEFAULT as the surrounding headers/messages. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-50-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/locking/rtmutex-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458..5e63d6e8a223 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack_loglvl(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); -- cgit v1.2.3 From 9cb8f069deeed708bf19486d5893e297dc467ae0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:29 -0700 Subject: kernel: rename show_stack_loglvl() => show_stack() Now the last users of show_stack() got converted to use an explicit log level, show_stack_loglvl() can drop it's redundant suffix and become once again well known show_stack(). Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/locking/rtmutex-debug.c | 2 +- kernel/sched/core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 43f5dcd2b9ac..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -30,7 +30,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) kdb_dump_stack_on_cpu(kdb_process_cpu(p)); console_loglevel = old_lvl; } else { - show_stack_loglvl(p, addr, KERN_EMERG); + show_stack(p, addr, KERN_EMERG); } kdb_trap_printk--; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 5e63d6e8a223..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack_loglvl(task, NULL, KERN_DEFAULT); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c68a6e7b306f..8f360326861e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack_loglvl(p, NULL, KERN_INFO); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); -- cgit v1.2.3 From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c300253a7b8e..1aecef938822 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -66,7 +66,6 @@ #include #include -#include #include static void __unhash_process(struct task_struct *p, bool group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index cefe8745c46e..3603e14474cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,7 +96,6 @@ #include #include -#include #include #include #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 659800157b17..881128b9351e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -34,7 +34,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From ca5999fde0a1761665a38e4c9a72dbcd7d190a81 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:38 -0700 Subject: mm: introduce include/linux/pgtable.h The include/linux/pgtable.h is going to be the home of generic page table manipulation functions. Start with moving asm-generic/pgtable.h to include/linux/pgtable.h and make the latter include asm/pgtable.h. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..c3ae2adaeccd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From d8ed45c5dcd455fc5848d47f86883a1b872ac0d0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:25 -0700 Subject: mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 ++-- kernel/bpf/stackmap.c | 4 ++-- kernel/events/core.c | 4 ++-- kernel/events/uprobes.c | 16 ++++++++-------- kernel/exit.c | 8 ++++---- kernel/fork.c | 12 ++++++------ kernel/futex.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sys.c | 18 +++++++++--------- kernel/trace/trace_output.c | 4 ++-- 10 files changed, 39 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 11ff4a596d6b..c530568dd51c 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = current->mm->mmap; while (vma) { vsize += vma->vm_end - vma->vm_start; vma = vma->vm_next; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 7b8381ce40a0..a13b7e28eaf8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -317,7 +317,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - down_read_trylock(¤t->mm->mmap_sem) == 0) { + mmap_read_trylock(current->mm) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,7 +342,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } else { work->sem = ¤t->mm->mmap_sem; irq_work_queue(&work->irq_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 63d66bbebbd5..2861addad657 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9784,7 +9784,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (!mm) goto restart; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); @@ -9810,7 +9810,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e51ec844c87c..cd82e1ba6b9b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1058,7 +1058,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1080,7 +1080,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); @@ -1235,7 +1235,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long vaddr; loff_t offset; @@ -1252,7 +1252,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1439,7 +1439,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { @@ -1469,7 +1469,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -2039,7 +2039,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { @@ -2057,7 +2057,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return uprobe; } diff --git a/kernel/exit.c b/kernel/exit.c index 1aecef938822..36cbaa43ac80 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -445,12 +445,12 @@ static void exit_mm(void) * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); core_state = mm->core_state; if (core_state) { struct core_thread self; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); self.task = current; self.next = xchg(&core_state->dumper.next, &self); @@ -468,14 +468,14 @@ static void exit_mm(void) freezable_schedule(); } __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); current->mm = NULL; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 3603e14474cd..d61751ba10dc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -492,7 +492,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { + if (mmap_write_lock_killable(oldmm)) { retval = -EINTR; goto fail_uprobe_end; } @@ -617,9 +617,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); @@ -649,9 +649,9 @@ static inline void mm_free_pgd(struct mm_struct *mm) #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - down_write(&oldmm->mmap_sem); + mmap_write_lock(oldmm); RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) @@ -1022,7 +1022,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); + mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; mm_pgtables_bytes_init(mm); diff --git a/kernel/futex.c b/kernel/futex.c index b4b9f960b610..e646661f6282 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -698,10 +698,10 @@ static int fault_in_user_writeable(u32 __user *uaddr) struct mm_struct *mm = current->mm; int ret; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret < 0 ? ret : 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35f4cc024dcf..cbcb2f71599b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2770,7 +2770,7 @@ static void task_numa_work(struct callback_head *work) return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); if (!vma) { @@ -2838,7 +2838,7 @@ out: mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code diff --git a/kernel/sys.c b/kernel/sys.c index 891667a49bb7..12805750a66c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (exe_file) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -1855,7 +1855,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_err; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); } @@ -1869,7 +1869,7 @@ exit: fdput(exe); return err; exit_err: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); goto exit; } @@ -2010,7 +2010,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * arg_lock protects concurent updates but we still need mmap_sem for * read to exclude races with sys_brk. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); /* * We don't validate if these members are pointing to @@ -2049,7 +2049,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -2125,7 +2125,7 @@ static int prctl_set_mm(int opt, unsigned long addr, * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr * validation. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); @@ -2217,7 +2217,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: spin_unlock(&mm->arg_lock); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return error; } @@ -2442,13 +2442,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - if (down_write_killable(&me->mm->mmap_sem)) + if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); - up_write(&me->mm->mmap_sem); + mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9a121e147102..73976de7f8cc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (mm) { const struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; @@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit v1.2.3 From aaa2cc56c1cd757efec88a4978ffce4cbf884352 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:33 -0700 Subject: mmap locking API: convert nested write lock sites Add API for nested write locks and convert the few call sites doing that. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-7-walken@google.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d61751ba10dc..142b23645d82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); -- cgit v1.2.3 From 0cc55a0213a02b760ade1d4755fdccfbf7d3157e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:37 -0700 Subject: mmap locking API: add mmap_read_trylock_non_owner() Add a couple APIs used by kernel/bpf/stackmap.c only: - mmap_read_trylock_non_owner() - mmap_read_unlock_non_owner() (may be called from a work queue). It's still not ideal that bpf/stackmap subverts the lock ownership in this way. Thanks to Peter Zijlstra for suggesting this API as the least-ugly way of addressing this in the short term. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-8-walken@google.com Signed-off-by: Linus Torvalds --- kernel/bpf/stackmap.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a13b7e28eaf8..599488f25e40 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -33,7 +33,7 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) @@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry) return; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - mmap_read_trylock(current->mm) == 0) { + !mmap_read_trylock_non_owner(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock(current->mm); + mmap_read_unlock_non_owner(current->mm); } else { - work->sem = ¤t->mm->mmap_sem; + work->mm = current->mm; irq_work_queue(&work->irq_work); - /* - * The irq_work will release the mmap_sem with - * up_read_non_owner(). The rwsem_release() is called - * here to release the lock from lockdep's perspective. - */ - rwsem_release(¤t->mm->mmap_sem.dep_map, _RET_IP_); } } -- cgit v1.2.3 From c1e8d7c6a7a682e1405e3e242d32fc377fd196ff Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:54 -0700 Subject: mmap locking API: convert mmap_sem comments Convert comments that reference mmap_sem to reference mmap_lock instead. [akpm@linux-foundation.org: fix up linux-next leftovers] [akpm@linux-foundation.org: s/lockaphore/lock/, per Vlastimil] [akpm@linux-foundation.org: more linux-next fixups, per Michel] Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Vlastimil Babka Reviewed-by: Daniel Jordan Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-13-walken@google.com Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c | 4 ++-- kernel/events/core.c | 6 +++--- kernel/events/uprobes.c | 4 ++-- kernel/exit.c | 2 +- kernel/relay.c | 2 +- kernel/sys.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index c530568dd51c..b0c5b3a9f5af 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -40,7 +40,7 @@ * is one more bug... 10/11/98, AV. * * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks * a struct file opened for write. Fixed. 2/6/2000, AV. */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 729d3a5c772e..642415b8c3c9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs) guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold @@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, diff --git a/kernel/events/core.c b/kernel/events/core.c index 2861addad657..856d98c36f56 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1316,7 +1316,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex - * mmap_sem + * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -3080,7 +3080,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly - * registered mapping, called for every new mmap(), with mm::mmap_sem down + * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. @@ -9742,7 +9742,7 @@ static void perf_addr_filters_splice(struct perf_event *event, /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. - * Called with mm::mmap_sem down for reading. + * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cd82e1ba6b9b..4f4993dcf17a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -457,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held for write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -1349,7 +1349,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/kernel/exit.c b/kernel/exit.c index 36cbaa43ac80..727150f28103 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -440,7 +440,7 @@ static void exit_mm(void) sync_mm_rss(mm); /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state + * We must hold mmap_lock around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. diff --git a/kernel/relay.c b/kernel/relay.c index 204867220f8a..72fe443ea78f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array) * * Returns 0 if ok, negative on error * - * Caller should already have grabbed mmap_sem. + * Caller should already have grabbed mmap_lock. */ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { diff --git a/kernel/sys.c b/kernel/sys.c index 12805750a66c..fd46865b46ba 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2007,7 +2007,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_sem for + * arg_lock protects concurent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); @@ -2122,7 +2122,7 @@ static int prctl_set_mm(int opt, unsigned long addr, /* * arg_lock protects concurent updates of arg boundaries, we need - * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ mmap_read_lock(mm); -- cgit v1.2.3 From bd88bb5d4007949be7154deae7cef7173c751a95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:14 -0700 Subject: maccess: rename strncpy_from_unsafe_user to strncpy_from_user_nofault This matches the naming of strncpy_from_user, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-7-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3744372a24e2..334db5a299a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -159,7 +159,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto = { BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, const void __user *, unsafe_ptr) { - int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + int ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); @@ -419,7 +419,7 @@ fmt_str: sizeof(buf)); break; case 'u': - strncpy_from_unsafe_user(buf, + strncpy_from_user_nofault(buf, (__force void __user *)unsafe_ptr, sizeof(buf)); break; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..d600f41fda1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1268,7 +1268,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); -- cgit v1.2.3 From c4cb164426aebd635baa53685b0ebf1a127e9803 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:17 -0700 Subject: maccess: rename strncpy_from_unsafe_strict to strncpy_from_kernel_nofault This matches the naming of strncpy_from_user_nofault, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-8-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 334db5a299a0..879ee676bf5a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -240,7 +240,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, * is returned that can be used for bpf_perf_event_output() et al. */ ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) out: memset(dst, 0, size); @@ -415,7 +415,7 @@ fmt_str: break; #endif case 'k': - strncpy_from_unsafe_strict(buf, unsafe_ptr, + strncpy_from_kernel_nofault(buf, unsafe_ptr, sizeof(buf)); break; case 'u': -- cgit v1.2.3 From 02dddb160ec1dccb51e75f3113654a090bc3963a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:20 -0700 Subject: maccess: rename strnlen_unsafe_user to strnlen_user_nofault This matches the naming of strnlen_user, and also makes it more clear what the function is supposed to do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-9-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d600f41fda1c..4325f9e7fada 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1221,7 +1221,7 @@ fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* -- cgit v1.2.3 From d7b2977b816223a884814eea46fbe38e192cec4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:30 -0700 Subject: bpf: factor out a bpf_trace_copy_string helper Split out a helper to do the fault free access to the string pointer to get it out of a crazy indentation level. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-12-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 879ee676bf5a..60e82c7b8122 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -324,6 +324,28 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } +static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + strncpy_from_unsafe(buf, unsafe_ptr, bufsz); + break; +#endif + case 'k': + strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + break; + case 'u': + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } +} + /* * Only limited trace_printk() conversion specifiers allowed: * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s @@ -406,24 +428,8 @@ fmt_str: break; } - buf[0] = 0; - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, - sizeof(buf)); - break; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, - sizeof(buf)); - break; - case 'u': - strncpy_from_user_nofault(buf, - (__force void __user *)unsafe_ptr, - sizeof(buf)); - break; - } + bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, + sizeof(buf)); goto fmt_next; } -- cgit v1.2.3 From aec6ce59133edc4ac04f7d4e2556fdf047becb62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:33 -0700 Subject: bpf: handle the compat string in bpf_trace_copy_string better User the proper helper for kernel or userspace addresses based on TASK_SIZE instead of the dangerous strncpy_from_unsafe function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-13-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 60e82c7b8122..a2efbdad434b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -334,8 +334,11 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, bufsz); - break; + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } + fallthrough; #endif case 'k': strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); -- cgit v1.2.3 From 19c8d8ac63d39578483db0b82107c172d8edfb07 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Jun 2020 21:34:37 -0700 Subject: bpf:bpf_seq_printf(): handle potentially unsafe format string better User the proper helper for kernel or userspace addresses based on TASK_SIZE instead of the dangerous strncpy_from_unsafe function. Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a2efbdad434b..deefe2823be1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -588,15 +588,17 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] == 's') { + void *unsafe_ptr; + /* try our best to copy */ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { err = -E2BIG; goto out; } - err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + unsafe_ptr = (void *)(long)args[fmt_cnt]; + err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], + unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3 From 8d92db5c04d10381f4db70ed99b1b576f5db18a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:40 -0700 Subject: bpf: rework the compat kernel probe handling Instead of using the dangerous probe_kernel_read and strncpy_from_unsafe helpers, rework the compat probes to check if an address is a kernel or userspace one, and then use the low-level kernel or user probe helper shared by the proper kernel and user probe helpers. This slightly changes behavior as the compat probe on a user address doesn't check the lockdown flags, just as the pure user probes do. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-14-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 109 +++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index deefe2823be1..59cb19888891 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -136,17 +136,23 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { - int ret = probe_user_read(dst, unsafe_ptr, size); + int ret; + ret = probe_user_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, @@ -156,17 +162,24 @@ const struct bpf_func_proto bpf_probe_read_user_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_str_common(void *dst, u32 size, + const void __user *unsafe_ptr) { - int ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); + int ret; + ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, @@ -177,25 +190,25 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = { }; static __always_inline int -bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; - ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) : - probe_kernel_read_strict(dst, unsafe_ptr, size); + goto fail; + ret = probe_kernel_read_strict(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + return ret; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { @@ -207,50 +220,37 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); -} - -static const struct bpf_func_proto bpf_probe_read_compat_proto = { - .func = bpf_probe_read_compat, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - static __always_inline int -bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; + goto fail; + /* - * The strncpy_from_unsafe_*() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing + * The strncpy_from_kernel_nofault() call will likely not fill the + * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. */ - ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_kernel_nofault(dst, unsafe_ptr, size); + ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + + return 0; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { @@ -262,10 +262,34 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_str_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { @@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) -- cgit v1.2.3 From 9de1fec50b23117f0a19f7609cc837ca72e764a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:44 -0700 Subject: tracing/kprobes: handle mixed kernel/userspace probes better Instead of using the dangerous probe_kernel_read and strncpy_from_unsafe helpers, rework probes to try a user probe based on the address if the architecture has a common address space for kernel and userspace. [svens@linux.ibm.com:use strncpy_from_kernel_nofault() in fetch_store_string()] Link: http://lkml.kernel.org/r/20200606181903.49384-1-svens@linux.ibm.com Signed-off-by: Christoph Hellwig Signed-off-by: Sven Schnelle Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-15-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 72 +++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4325f9e7fada..6a23b45613a2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,6 +1200,15 @@ static const struct file_operations kprobe_profile_ops = { /* Kprobe specific fetch functions */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) @@ -1207,30 +1216,27 @@ fetch_store_strlen(unsigned long addr) int ret, len = 0; u8 c; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + do { - ret = probe_kernel_read(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read_strict(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); return (ret < 0) ? ret : len; } -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); -} - /* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. */ static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base) { + const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; @@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1252,35 +1254,37 @@ fetch_store_string(unsigned long addr, void *dest, void *base) } /* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. */ static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + if (unlikely(!maxlen)) return -ENOMEM; __dest = get_loc_data(dest, base); - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ - return probe_kernel_read(dest, src, size); -} - static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size) { @@ -1289,6 +1293,16 @@ probe_mem_read_user(void *dest, void *src, size_t size) return probe_user_read(dest, uaddr, size); } +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return probe_kernel_read_strict(dest, src, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, -- cgit v1.2.3 From 98a23609b10364a51a1bb3688f8dd1cd1aa94a9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2020 21:34:50 -0700 Subject: maccess: always use strict semantics for probe_kernel_read Except for historical confusion in the kprobes/uprobes and bpf tracers, which has been fixed now, there is no good reason to ever allow user memory accesses from probe_kernel_read. Switch probe_kernel_read to only read from kernel memory. [akpm@linux-foundation.org: update it for "mm, dump_page(): do not crash with invalid mapping pointer"] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200521152301.2587579-17-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/trace/bpf_trace.c | 2 +- kernel/trace/trace_kprobe.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 59cb19888891..e729c9e587a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,7 +196,7 @@ bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) if (unlikely(ret < 0)) goto fail; - ret = probe_kernel_read_strict(dst, unsafe_ptr, size); + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) goto fail; return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6a23b45613a2..ea8d0b094f1b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1222,7 +1222,7 @@ fetch_store_strlen(unsigned long addr) #endif do { - ret = probe_kernel_read_strict(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); @@ -1300,7 +1300,7 @@ probe_mem_read(void *dest, void *src, size_t size) if ((unsigned long)src < TASK_SIZE) return probe_mem_read_user(dest, src, size); #endif - return probe_kernel_read_strict(dest, src, size); + return probe_kernel_read(dest, src, size); } /* Note that we don't verify it, since the code does not come from user space */ -- cgit v1.2.3 From 013b2deba9a6b80ca02f4fafd7dedf875e9b4450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 May 2020 18:47:25 +0200 Subject: uprobes: ensure that uprobe->offset and ->ref_ctr_offset are properly aligned uprobe_write_opcode() must not cross page boundary; prepare_uprobe() relies on arch_uprobe_analyze_insn() which should validate "vaddr" but some architectures (csky, s390, and sparc) don't do this. We can remove the BUG_ON() check in prepare_uprobe() and validate the offset early in __uprobe_register(). The new IS_ALIGNED() check matches the alignment check in arch_prepare_kprobe() on supported architectures, so I think that all insns must be aligned to UPROBE_SWBP_INSN_SIZE. Another problem is __update_ref_ctr() which was wrong from the very beginning, it can read/write outside of kmap'ed page unless "vaddr" is aligned to sizeof(short), __uprobe_register() should check this too. Reported-by: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Reviewed-by: Srikar Dronamraju Acked-by: Christian Borntraeger Tested-by: Sven Schnelle Cc: Steven Rostedt Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e51ec844c87c..5dc2f46f38c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -861,10 +861,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* uprobe_write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); @@ -1160,6 +1156,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset, if (offset > i_size_read(inode)) return -EINVAL; + /* + * This ensures that copy_from_page(), copy_to_page() and + * __update_ref_ctr() can't cross page boundary. + */ + if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) + return -EINVAL; + if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) + return -EINVAL; + retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) @@ -2008,6 +2013,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) + return -EINVAL; + pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); -- cgit v1.2.3 From 22d5bd6867364b41576a712755271a7d6161abd6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Jun 2020 14:45:32 +0200 Subject: tracing/probe: Fix bpf_task_fd_query() for kprobes and uprobes Commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") removed the trace_[ku]probe structure from the trace_event_call->data pointer. As bpf_get_[ku]probe_info() were forgotten in that change, fix them now. These functions are currently only used by the bpf_task_fd_query() syscall handler to collect information about a perf event. Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20200608124531.819838-1-jean-philippe@linaro.org --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..8eeb95e04bf5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1629,7 +1629,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tk = find_trace_kprobe(pevent, group); else - tk = event->tp_event->data; + tk = trace_kprobe_primary_from_call(event->tp_event); if (!tk) return -EINVAL; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a8e8e9c1c75..fdd47f99b18f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else - tu = event->tp_event->data; + tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; -- cgit v1.2.3 From 26afa0a4eb3fd87757f9de56ec5db5a03b14e120 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 8 Jun 2020 09:17:23 -0600 Subject: bpf: Reset data_meta before running programs attached to devmap entry This is a new context that does not handle metadata at the moment, so mark data_meta invalid. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608151723.9539-1-dsahern@kernel.org --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 854b09beb16b..bfdff2faf5cb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -479,6 +479,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp_set_data_meta_invalid(xdp); xdp->txq = &txq; act = bpf_prog_run_xdp(xdp_prog, xdp); -- cgit v1.2.3 From 248e00ac47d64e153b9c50f45aad73cd61894a73 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 17:22:01 +0100 Subject: bpf: cgroup: Allow multi-attach program to replace itself When using BPF_PROG_ATTACH to attach a program to a cgroup in BPF_F_ALLOW_MULTI mode, it is not possible to replace a program with itself. This is because the check for duplicate programs doesn't take the replacement program into account. Replacing a program with itself might seem weird, but it has some uses: first, it allows resetting the associated cgroup storage. Second, it makes the API consistent with the non-ALLOW_MULTI usage, where it is possible to replace a program with itself. Third, it aligns BPF_PROG_ATTACH with bpf_link, where replacing itself is also supported. Sice this code has been refactored a few times this change will only apply to v5.7 and later. Adjustments could be made to commit 1020c1f24a94 ("bpf: Simplify __cgroup_bpf_attach") and commit d7bf2c10af05 ("bpf: allocate cgroup storage entries on attaching bpf programs") as well as commit 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608162202.94002-1-lmb@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fdf7836750a3..4d76f16524cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, } list_for_each_entry(pl, progs, node) { - if (prog && pl->prog == prog) + if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) -- cgit v1.2.3 From 281920b7e0b31e0a7706433ff58e7d52ac97c327 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:46 +0200 Subject: bpf: Devmap adjust uapi for attach bpf program V2: - Defer changing BPF-syscall to start at file-descriptor 1 - Use {} to zero initialise struct. The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry"), introduced ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor. As zero were a valid FD, not using the feature requires using value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. This will break older userspace applications not using the bpf_prog feature. Consider an old userspace app that is compiled against newer kernel uapi/bpf.h, it will not know that it need to initialise the member bpf_prog.fd to minus-1. Thus, users will be forced to update source code to get program running on newer kernels. This patch remove the minus-1 checks, and have zero mean feature isn't used. Followup patches either for kernel or libbpf should handle and avoid returning file-descriptor zero in the first place. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170950687.2102545.7235914718298050113.stgit@firesoul --- kernel/bpf/devmap.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfdff2faf5cb..0cbb72cdaf63 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; @@ -619,7 +610,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { + if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) @@ -653,8 +644,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -670,7 +661,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) + if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); @@ -700,8 +691,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 3021e69219e2f3df6d01243000db32d1325cdd0d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 10 Jun 2020 18:41:28 -0700 Subject: kcov: check kcov_softirq in kcov_remote_stop() kcov_remote_stop() should check that the corresponding kcov_remote_start() actually found the specified remote handle and started collecting coverage. This is done by checking the per thread kcov_softirq flag. A particular failure scenario where this was observed involved a softirq with a remote coverage collection section coming between check_kcov_mode() and the access to t->kcov_area in __sanitizer_cov_trace_pc(). In that softirq kcov_remote_start() bailed out after kcov_remote_find() check, but the matching kcov_remote_stop() didn't check if kcov_remote_start() succeeded, and overwrote per thread kcov parameters with invalid (zero) values. Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Marco Elver Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/fcd1cd16eac1d2c01a66befd8ea4afc6f8d09833.1591576806.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 55c5d883a93e..6afae0bcbac4 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -427,7 +427,8 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(!kcov->remote && kcov->t != t); * * For KCOV_REMOTE_ENABLE devices, the exiting task is either: - * 2. A remote task between kcov_remote_start() and kcov_remote_stop(). + * + * 1. A remote task between kcov_remote_start() and kcov_remote_stop(). * In this case we should print a warning right away, since a task * shouldn't be exiting when it's in a kcov coverage collection * section. Here t points to the task that is collecting remote @@ -437,7 +438,7 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(kcov->remote && kcov->t != t); * * 2. The task that created kcov exiting without calling KCOV_DISABLE, - * and then again we can make sure that t->kcov->t == t: + * and then again we make sure that t->kcov->t == t: * WARN_ON(kcov->remote && kcov->t != t); * * By combining all three checks into one we get: @@ -764,7 +765,7 @@ static const struct file_operations kcov_fops = { * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to - * be collected via __sanitizer_cov_trace_pc() + * be collected via __sanitizer_cov_trace_pc(). * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ @@ -972,16 +973,25 @@ void kcov_remote_stop(void) local_irq_restore(flags); return; } - kcov = t->kcov; - area = t->kcov_area; - size = t->kcov_size; - sequence = t->kcov_sequence; - + /* + * When in softirq, check if the corresponding kcov_remote_start() + * actually found the remote handle and started collecting coverage. + */ + if (in_serving_softirq() && !t->kcov_softirq) { + local_irq_restore(flags); + return; + } + /* Make sure that kcov_softirq is only set when in softirq. */ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { local_irq_restore(flags); return; } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + kcov_stop(t); if (in_serving_softirq()) { t->kcov_softirq = 0; -- cgit v1.2.3 From 9bf5b9eb232b34738800868e30bea3bad4a6a1ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:41:59 -0700 Subject: kernel: move use_mm/unuse_mm to kthread.c Patch series "improve use_mm / unuse_mm", v2. This series improves the use_mm / unuse_mm interface by better documenting the assumptions, and my taking the set_fs manipulations spread over the callers into the core API. This patch (of 3): Use the proper API instead. Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de These helpers are only for use with kernel threads, and I will tie them more into the kthread infrastructure going forward. Also move the prototypes to kthread.h - mmu_context.h was a little weird to start with as it otherwise contains very low-level MM bits. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Felix Kuehling Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de Link: http://lkml.kernel.org/r/20200416053158.586887-1-hch@lst.de Link: http://lkml.kernel.org/r/20200404094101.672954-5-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..ce4610316377 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. + * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include +#include +#include #include +#include #include #include #include @@ -25,6 +29,7 @@ #include #include + static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -1203,6 +1208,57 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + mmgrab(mm); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != mm) + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + sync_mm_rss(mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); + #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread -- cgit v1.2.3 From f5678e7f2ac31c270334b936352f0ef2fe7dd2b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:42:06 -0700 Subject: kernel: better document the use_mm/unuse_mm API contract Switch the function documentation to kerneldoc comments, and add WARN_ON_ONCE asserts that the calling thread is a kernel thread and does not have ->mm set (or has ->mm set in the case of unuse_mm). Also give the functions a kthread_ prefix to better document the use case. [hch@lst.de: fix a comment typo, cover the newly merged use_mm/unuse_mm caller in vfio] Link: http://lkml.kernel.org/r/20200416053158.586887-3-hch@lst.de [sfr@canb.auug.org.au: powerpc/vas: fix up for {un}use_mm() rename] Link: http://lkml.kernel.org/r/20200422163935.5aa93ba5@canb.auug.org.au Signed-off-by: Christoph Hellwig Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Felix Kuehling Acked-by: Greg Kroah-Hartman [usb] Acked-by: Haren Myneni Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Link: http://lkml.kernel.org/r/20200404094101.672954-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ce4610316377..8ed4b4fbec7c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1208,18 +1208,18 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -/* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_use_mm - make the calling kthread operate on an address space + * @mm: address space to operate on */ -void use_mm(struct mm_struct *mm) +void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(tsk->mm); + task_lock(tsk); active_mm = tsk->active_mm; if (active_mm != mm) { @@ -1236,20 +1236,19 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } -EXPORT_SYMBOL_GPL(use_mm); +EXPORT_SYMBOL_GPL(kthread_use_mm); -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_unuse_mm - reverse the effect of kthread_use_mm() + * @mm: address space to operate on */ -void unuse_mm(struct mm_struct *mm) +void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(!tsk->mm); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; @@ -1257,7 +1256,7 @@ void unuse_mm(struct mm_struct *mm) enter_lazy_tlb(mm, tsk); task_unlock(tsk); } -EXPORT_SYMBOL_GPL(unuse_mm); +EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** -- cgit v1.2.3 From 37c54f9bd48663f7657a9178fe08c47e4f5b537b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jun 2020 18:42:10 -0700 Subject: kernel: set USER_DS in kthread_use_mm Some architectures like arm64 and s390 require USER_DS to be set for kernel threads to access user address space, which is the whole purpose of kthread_use_mm, but other like x86 don't. That has lead to a huge mess where some callers are fixed up once they are tested on said architectures, while others linger around and yet other like io_uring try to do "clever" optimizations for what usually is just a trivial asignment to a member in the thread_struct for most architectures. Make kthread_use_mm set USER_DS, and kthread_unuse_mm restore to the previous value instead. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Tested-by: Jens Axboe Reviewed-by: Jens Axboe Acked-by: Michael S. Tsirkin Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Felix Kuehling Cc: Jason Wang Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Link: http://lkml.kernel.org/r/20200404094101.672954-7-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/kthread.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8ed4b4fbec7c..86357cd38eb2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,6 +52,7 @@ struct kthread { unsigned long flags; unsigned int cpu; void *data; + mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -1235,6 +1236,9 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); + + to_kthread(tsk)->oldfs = get_fs(); + set_fs(USER_DS); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1249,6 +1253,8 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); + set_fs(to_kthread(tsk)->oldfs); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; -- cgit v1.2.3 From 2a9e5ded9543436620a7fbc9329ddcc32bf97bc7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 20 May 2020 12:22:33 +0200 Subject: printk/kdb: Redirect printk messages into kdb in any context kdb has to get messages on consoles even when the system is stopped. It uses kdb_printf() internally and calls console drivers on its own. It uses a hack to reuse an existing code. It sets "kdb_trap_printk" global variable to redirect even the normal printk() into the kdb_printf() variant. The variable "kdb_trap_printk" is checked in printk_default() and it is ignored when printk is redirected to printk_safe in NMI context. Solve this by moving the check into printk_func(). It is obvious that it is not fully safe. But it does not make things worse. The console drivers are already called in this context by db_printf() direct calls. Reported-by: Sumit Garg Tested-by: Sumit Garg Reviewed-by: Daniel Thompson Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200520102233.GC3464@linux-b0ei --- kernel/printk/printk.c | 14 +------------- kernel/printk/printk_safe.c | 7 +++++++ 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9fdd6a42ad6a..2167bb528dd3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2047,18 +2046,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - int r; - -#ifdef CONFIG_KGDB_KDB - /* Allow to pass printk() to kdb but avoid a recursion. */ - if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { - r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); - return r; - } -#endif - r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); - - return r; + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..7ccb821d0bfe 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,12 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + /* * Try to use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. -- cgit v1.2.3 From 0372007f5a79d61d3cb48a507717b9afb5d6addd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 11:05:22 +0100 Subject: context_tracking: Ensure that the critical path cannot be instrumented context tracking lacks a few protection mechanisms against instrumentation: - While the core functions are marked NOKPROBE they lack protection against function tracing which is required as the function entry/exit points can be utilized by BPF. - static functions invoked from the protected functions need to be marked as well as they can be instrumented otherwise. - using plain inline allows the compiler to emit traceable and probable functions. Fix this by marking the functions noinstr and converting the plain inlines to __always_inline. The NOKPROBE_SYMBOL() annotations are removed as the .noinstr.text section is already excluded from being probed. Cures the following objtool warnings: vmlinux.o: warning: objtool: enter_from_user_mode()+0x34: call to __context_tracking_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: prepare_exit_to_usermode()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_return_slowpath()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_syscall_64()+0x7f: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x3d: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_fast_syscall_32()+0x9c: call to __context_tracking_enter() leaves .noinstr.text section and generates new ones... Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.811520478@linutronix.de --- kernel/context_tracking.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ce430885c26c..36a98c48aedc 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -static bool context_tracking_recursion_enter(void) +static noinstr bool context_tracking_recursion_enter(void) { int recursion; @@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void) return false; } -static void context_tracking_recursion_exit(void) +static __always_inline void context_tracking_recursion_exit(void) { __this_cpu_dec(context_tracking.recursion); } @@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void __context_tracking_enter(enum ctx_state state) +void noinstr __context_tracking_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state) * on the tick. */ if (state == CONTEXT_USER) { + instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); + instrumentation_end(); } rcu_user_enter(); } @@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state) } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_enter); EXPORT_SYMBOL_GPL(__context_tracking_enter); void context_tracking_enter(enum ctx_state state) @@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void __context_tracking_exit(enum ctx_state state) +void noinstr __context_tracking_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; @@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state) */ rcu_user_exit(); if (state == CONTEXT_USER) { + instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); + instrumentation_end(); } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_exit); EXPORT_SYMBOL_GPL(__context_tracking_exit); void context_tracking_exit(enum ctx_state state) -- cgit v1.2.3 From 5916d5f9b3347344a3d96ba6b54cf8e290eba96a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2020 13:49:51 +0100 Subject: bug: Annotate WARN/BUG/stackfail as noinstr safe Warnings, bugs and stack protection fails from noinstr sections, e.g. low level and early entry code, are likely to be fatal. Mark them as "safe" to be invoked from noinstr protected code to avoid annotating all usage sites. Getting the information out is important. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.376598577@linutronix.de --- kernel/panic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 85568bbfb12b..e2157ca387c8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -680,10 +680,12 @@ device_initcall(register_warn_debugfs); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -__visible void __stack_chk_fail(void) +__visible noinstr void __stack_chk_fail(void) { + instrumentation_begin(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); -- cgit v1.2.3 From 865d3a9afe7eddf320e7f61a442864d6efe27505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 21:22:36 +0200 Subject: x86/mce: Address objtools noinstr complaints Mark the relevant functions noinstr, use the plain non-instrumented MSR accessors. The only odd part is the instrumentation_begin()/end() pair around the indirect machine_check_vector() call as objtool can't figure that out. The possible invoked functions are annotated correctly. Also use notrace variant of nmi_enter/exit(). If MCEs happen then hardware latency tracing is the least of the worries. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.476734898@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9ebaab13339d..d20d489841c8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. */ -time64_t __ktime_get_real_seconds(void) +noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.2.3 From 8a6bc4787f05d087fda8e11ead225c8830250703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:21 +0200 Subject: genirq: Provide irq_enter/exit_rcu() irq_enter()/exit() currently include RCU handling. To properly separate the RCU handling code, provide variants which contain only the non-RCU related functionality. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.567023613@linutronix.de --- kernel/softirq.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index a47c6dd57452..beb8e3a66c7c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } -/* - * Enter an interrupt context. +/** + * irq_enter_rcu - Enter an interrupt context with RCU watching */ -void irq_enter(void) +void irq_enter_rcu(void) { - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd @@ -354,10 +353,18 @@ void irq_enter(void) tick_irq_enter(); _local_bh_enable(); } - __irq_enter(); } +/** + * irq_enter - Enter an interrupt context including RCU update + */ +void irq_enter(void) +{ + rcu_irq_enter(); + irq_enter_rcu(); +} + static inline void invoke_softirq(void) { if (ksoftirqd_running(local_softirq_pending())) @@ -397,10 +404,12 @@ static inline void tick_irq_exit(void) #endif } -/* - * Exit an interrupt context. Process softirqs if needed and possible: +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. */ -void irq_exit(void) +void irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -413,6 +422,16 @@ void irq_exit(void) invoke_softirq(); tick_irq_exit(); +} + +/** + * irq_exit - Exit an interrupt context, update RCU and lockdep + * + * Also processes softirqs if needed and possible. + */ +void irq_exit(void) +{ + irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); -- cgit v1.2.3 From 59bc300b712998d10254ee20e24f2e7ec09c560a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:39 +0200 Subject: x86/entry: Clarify irq_{enter,exit}_rcu() Because: irq_enter_rcu() includes lockdep_hardirq_enter() irq_exit_rcu() does *NOT* include lockdep_hardirq_exit() Which resulted in two 'stray' lockdep_hardirq_exit() calls in idtentry.h, and me spending a long time trying to find the matching enter calls. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.359433429@infradead.org --- kernel/softirq.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index beb8e3a66c7c..c4201b7f42b1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -404,12 +404,7 @@ static inline void tick_irq_exit(void) #endif } -/** - * irq_exit_rcu() - Exit an interrupt context without updating RCU - * - * Also processes softirqs if needed and possible. - */ -void irq_exit_rcu(void) +static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -424,6 +419,18 @@ void irq_exit_rcu(void) tick_irq_exit(); } +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. + */ +void irq_exit_rcu(void) +{ + __irq_exit_rcu(); + /* must be last! */ + lockdep_hardirq_exit(); +} + /** * irq_exit - Exit an interrupt context, update RCU and lockdep * @@ -431,7 +438,7 @@ void irq_exit_rcu(void) */ void irq_exit(void) { - irq_exit_rcu(); + __irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); -- cgit v1.2.3 From bf2b3008440072068580c609d79a079656af0588 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:40 +0200 Subject: x86/entry: Rename trace_hardirqs_off_prepare() The typical pattern for trace_hardirqs_off_prepare() is: ENTRY lockdep_hardirqs_off(); // because hardware ... do entry magic instrumentation_begin(); trace_hardirqs_off_prepare(); ... do actual work trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); ... do exit magic lockdep_hardirqs_on(); which shows that it's named wrong, rename it to trace_hardirqs_off_finish(), as it concludes the hardirq_off transition. Also, given that the above is the only correct order, make the traditional all-in-one trace_hardirqs_off() follow suit. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.415774872@infradead.org --- kernel/trace/trace_preemptirq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index fb0691b8a88d..f10073e62603 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -58,7 +58,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on); * and lockdep uses a staged approach which splits the lockdep hardirq * tracking into a RCU on and a RCU off section. */ -void trace_hardirqs_off_prepare(void) +void trace_hardirqs_off_finish(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); @@ -68,19 +68,19 @@ void trace_hardirqs_off_prepare(void) } } -EXPORT_SYMBOL(trace_hardirqs_off_prepare); -NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); +EXPORT_SYMBOL(trace_hardirqs_off_finish); +NOKPROBE_SYMBOL(trace_hardirqs_off_finish); void trace_hardirqs_off(void) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); -- cgit v1.2.3 From 6eebad1ad303db360ebe3e51c2b9656c3d407157 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:21 +0200 Subject: lockdep: __always_inline more for noinstr vmlinux.o: warning: objtool: debug_locks_off()+0xd: call to __debug_locks_off() leaves .noinstr.text section vmlinux.o: warning: objtool: match_held_lock()+0x6a: call to look_up_lock_class.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x90: call to lockdep_recursion_finish() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.185201076@infradead.org --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 38cce34d03dc..29a8de4c50b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,7 +393,7 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -static inline void lockdep_recursion_finish(void) +static __always_inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) current->lockdep_recursion = 0; @@ -801,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static inline struct lock_class * +static __always_inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; -- cgit v1.2.3 From 75d75b7a4d5489cc6a5e91ace306f6c13f376f33 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 21 May 2020 16:20:39 +0200 Subject: kcsan: Support distinguishing volatile accesses In the kernel, the "volatile" keyword is used in various concurrent contexts, whether in low-level synchronization primitives or for legacy reasons. If supported by the compiler, it will be assumed that aligned volatile accesses up to sizeof(long long) (matching compiletime_assert_rwonce_type()) are atomic. Recent versions of Clang [1] (GCC tentative [2]) can instrument volatile accesses differently. Add the option (required) to enable the instrumentation, and provide the necessary runtime functions. None of the updated compilers are widely available yet (Clang 11 will be the first release to support the feature). [1] https://github.com/llvm/llvm-project/commit/5a2c31116f412c3b6888be361137efd705e05814 [2] https://gcc.gnu.org/pipermail/gcc-patches/2020-April/544452.html This change allows removing of any explicit checks in primitives such as READ_ONCE() and WRITE_ONCE(). [ bp: Massage commit message a bit. ] Signed-off-by: Marco Elver Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200521142047.169334-4-elver@google.com --- kernel/kcsan/core.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index a73a66cf79df..15f67949d11e 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -789,6 +789,49 @@ void __tsan_write_range(void *ptr, size_t size) } EXPORT_SYMBOL(__tsan_write_range); +/* + * Use of explicit volatile is generally disallowed [1], however, volatile is + * still used in various concurrent context, whether in low-level + * synchronization primitives or for legacy reasons. + * [1] https://lwn.net/Articles/233479/ + * + * We only consider volatile accesses atomic if they are aligned and would pass + * the size-check of compiletime_assert_rwonce_type(). + */ +#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_read##size); \ + void __tsan_unaligned_volatile_read##size(void *ptr) \ + __alias(__tsan_volatile_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, \ + KCSAN_ACCESS_WRITE | \ + (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_write##size); \ + void __tsan_unaligned_volatile_write##size(void *ptr) \ + __alias(__tsan_volatile_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size) + +DEFINE_TSAN_VOLATILE_READ_WRITE(1); +DEFINE_TSAN_VOLATILE_READ_WRITE(2); +DEFINE_TSAN_VOLATILE_READ_WRITE(4); +DEFINE_TSAN_VOLATILE_READ_WRITE(8); +DEFINE_TSAN_VOLATILE_READ_WRITE(16); + /* * The below are not required by KCSAN, but can still be emitted by the * compiler. -- cgit v1.2.3 From 29fcb05bbf1a7008900bb9bee347bdbfc7171036 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 17:21:15 -0700 Subject: bpf: Undo internal BPF_PROBE_MEM in BPF insns dump BPF_PROBE_MEM is kernel-internal implmementation details. When dumping BPF instructions to user-space, it needs to be replaced back with BPF_MEM mode. Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200613002115.1632142-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..e9a3ebc00e08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3158,6 +3158,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) struct bpf_insn *insns; u32 off, type; u64 imm; + u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), @@ -3166,21 +3167,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + code = insns[i].code; + + if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + if (code == (BPF_JMP | BPF_CALL) || + code == (BPF_JMP | BPF_CALL_ARGS)) { + if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok()) insns[i].imm = 0; continue; } + if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { + insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; + continue; + } - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; -- cgit v1.2.3 From a7f7f6248d9740d710fd6bd190293fe5e16410ac Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 14 Jun 2020 01:50:22 +0900 Subject: treewide: replace '---help---' in Kconfig files with 'help' Since commit 84af7a6194e4 ("checkpatch: kconfig: prefer 'help' over '---help---'"), the number of '---help---' has been gradually decreasing, but there are still more than 2400 instances. This commit finishes the conversion. While I touched the lines, I also fixed the indentation. There are a variety of indentation styles found. a) 4 spaces + '---help---' b) 7 spaces + '---help---' c) 8 spaces + '---help---' d) 1 space + 1 tab + '---help---' e) 1 tab + '---help---' (correct indentation) f) 1 tab + 1 space + '---help---' g) 1 tab + 2 spaces + '---help---' In order to convert all of them to 1 tab + 'help', I ran the following commend: $ find . -name 'Kconfig*' | xargs sed -i 's/^[[:space:]]*---help---/\thelp/' Signed-off-by: Masahiro Yamada --- kernel/gcov/Kconfig | 4 ++-- kernel/irq/Kconfig | 4 ++-- kernel/power/Kconfig | 26 +++++++++++++------------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index feaad597b3f4..3110c77230c7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -6,7 +6,7 @@ config GCOV_KERNEL depends on DEBUG_FS select CONSTRUCTORS if !UML default n - ---help--- + help This option enables gcov-based code profiling (e.g. for code coverage measurements). @@ -42,7 +42,7 @@ config GCOV_PROFILE_ALL depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n - ---help--- + help This options activates profiling for the entire kernel. If unsure, say N. diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d63c324895ea..20512252ecc9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -118,7 +118,7 @@ config IRQ_FORCED_THREADING config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- + help Sparse irq numbering is useful for distro kernels that want to define a high CONFIG_NR_CPUS value but still want to have @@ -134,7 +134,7 @@ config GENERIC_IRQ_DEBUGFS depends on DEBUG_FS select GENERIC_IRQ_INJECTION default n - ---help--- + help Exposes internal state information through debugfs. Mostly for developers and debugging of hard to diagnose interrupt problems. diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4d0e6e815a2b..a7320f07689d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -3,7 +3,7 @@ config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE default y - ---help--- + help Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). @@ -42,7 +42,7 @@ config HIBERNATION select LZO_COMPRESS select LZO_DECOMPRESS select CRC32 - ---help--- + help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. @@ -84,7 +84,7 @@ config HIBERNATION_SNAPSHOT_DEV bool "Userspace snapshot device" depends on HIBERNATION default y - ---help--- + help Device used by the uswsusp tools. Say N if no snapshotting from userspace is needed, this also @@ -96,7 +96,7 @@ config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION default "" - ---help--- + help The default resume partition is the partition that the suspend- to-disk implementation will look for a suspended disk image. @@ -131,7 +131,7 @@ config PM_SLEEP_SMP_NONZERO_CPU def_bool y depends on PM_SLEEP_SMP depends on ARCH_SUSPEND_NONZERO_CPU - ---help--- + help If an arch can suspend (for suspend, hibernate, kexec, etc) on a non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This will allow nohz_full mask to include CPU0. @@ -140,7 +140,7 @@ config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP default n - ---help--- + help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -148,7 +148,7 @@ config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP default n - ---help--- + help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -165,7 +165,7 @@ config PM_WAKELOCKS_GC config PM bool "Device power management core functionality" - ---help--- + help Enable functionality allowing I/O devices to be put into energy-saving (low power) states, for example after a specified period of inactivity (autosuspended), and woken up in response to a hardware-generated @@ -179,7 +179,7 @@ config PM config PM_DEBUG bool "Power Management Debug Support" depends on PM - ---help--- + help This option enables various debugging support in the Power Management code. This is helpful when debugging and reporting PM bugs, like suspend support. @@ -187,7 +187,7 @@ config PM_DEBUG config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - ---help--- + help Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". @@ -195,7 +195,7 @@ config PM_ADVANCED_DEBUG config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- + help This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. Enable this with a kernel parameter like "test_suspend=mem". @@ -210,7 +210,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" depends on PM_DEBUG && PSTORE && EXPERT - ---help--- + help Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. A detected lockup causes system panic with message @@ -243,7 +243,7 @@ config PM_TRACE_RTC depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE - ---help--- + help This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). -- cgit v1.2.3 From 39030e1351aa1aa7443bb2da24426573077c83da Mon Sep 17 00:00:00 2001 From: Thomas Cedeno Date: Tue, 9 Jun 2020 10:22:13 -0700 Subject: security: Add LSM hooks to set*gid syscalls The SafeSetID LSM uses the security_task_fix_setuid hook to filter set*uid() syscalls according to its configured security policy. In preparation for adding analagous support in the LSM for set*gid() syscalls, we add the requisite hook here. Tested by putting print statements in the security_task_fix_setgid hook and seeing them get hit during kernel boot. Signed-off-by: Thomas Cedeno Signed-off-by: Micah Morton --- kernel/sys.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..f5c06c48d585 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -393,6 +393,10 @@ long __sys_setregid(gid_t rgid, gid_t egid) new->sgid = new->egid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -435,6 +439,10 @@ long __sys_setgid(gid_t gid) else goto error; + retval = security_task_fix_setgid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -756,6 +764,10 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) new->sgid = ksgid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -862,7 +874,8 @@ long __sys_setfsgid(gid_t gid) ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; - goto change_okay; + if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) + goto change_okay; } } -- cgit v1.2.3