diff options
Diffstat (limited to 'kernel')
37 files changed, 709 insertions, 363 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a5ae60f0b0a2..e6983be12bd3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,2 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o -ifdef CONFIG_TEST_BPF -obj-$(CONFIG_BPF_SYSCALL) += test_stub.o -endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 9eb4d8a7cd87..8a6616583f38 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) kvfree(array); } -static struct bpf_map_ops array_ops = { +static const struct bpf_map_ops array_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list array_type __read_mostly = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; static int __init register_array_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a64e7a207d2b..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b3ba43674310..83c209d9b17a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static struct bpf_map_ops htab_ops = { +static const struct bpf_map_ops htab_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list htab_type __read_mostly = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; static int __init register_htab_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&htab_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e3414d85459..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,8 @@ */ #include <linux/bpf.h> #include <linux/rcupdate.h> +#include <linux/random.h> +#include <linux/smp.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return (unsigned long) value; } -struct bpf_func_proto bpf_map_lookup_elem_proto = { +const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, @@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_update_elem(map, key, value, r4); } -struct bpf_func_proto bpf_map_update_elem_proto = { +const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_delete_elem(map, key); } -struct bpf_func_proto bpf_map_delete_elem_proto = { +const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -354,10 +354,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) list_for_each_entry(tl, &bpf_prog_types, list_node) { if (tl->type == type) { prog->aux->ops = tl->ops; - prog->aux->prog_type = type; + prog->type = type; return 0; } } + return -EINVAL; } @@ -418,6 +419,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -465,6 +467,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_buf @@ -508,7 +511,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->jited = false; atomic_set(&prog->aux->refcnt, 1); - prog->aux->is_gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -516,8 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); - + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; @@ -528,7 +530,6 @@ static int bpf_prog_load(union bpf_attr *attr) bpf_prog_select_runtime(prog); err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); - if (err < 0) /* failed to allocate fd */ goto free_used_maps; diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c deleted file mode 100644 index 0ceae1e6e8b5..000000000000 --- a/kernel/bpf/test_stub.c +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/bpf.h> - -/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC - * to be used by user space verifier testsuite - */ -struct bpf_context { - u64 arg1; - u64 arg2; -}; - -static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - default: - return NULL; - } -} - -static const struct bpf_context_access { - int size; - enum bpf_access_type type; -} test_ctx_access[] = { - [offsetof(struct bpf_context, arg1)] = { - FIELD_SIZEOF(struct bpf_context, arg1), - BPF_READ - }, - [offsetof(struct bpf_context, arg2)] = { - FIELD_SIZEOF(struct bpf_context, arg2), - BPF_READ - }, -}; - -static bool test_is_valid_access(int off, int size, enum bpf_access_type type) -{ - const struct bpf_context_access *access; - - if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) - return false; - - access = &test_ctx_access[off]; - if (access->size == size && (access->type & type)) - return true; - - return false; -} - -static struct bpf_verifier_ops test_ops = { - .get_func_proto = test_func_proto, - .is_valid_access = test_is_valid_access, -}; - -static struct bpf_prog_type_list tl_prog = { - .ops = &test_ops, - .type = BPF_PROG_TYPE_UNSPEC, -}; - -static int __init register_test_ops(void) -{ - bpf_register_prog_type(&tl_prog); - return 0; -} -late_initcall(register_test_ops); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..630a7bac1e51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, enum bpf_reg_type expected_type; int err = 0; - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_DONTCARE) return 0; if (reg->type == NOT_INIT) { @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } + if (arg_type == ARG_ANYTHING) + return 0; + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; @@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id) } /* eBPF programs must be GPL compatible to use GPL-ed functions */ - if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + if (!env->prog->gpl_compatible && fn->gpl_only) { verbose("cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } @@ -1606,11 +1623,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1629,6 +1645,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1776,6 +1815,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1867,6 +1913,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1889,13 +2021,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1905,7 +2037,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1937,7 +2069,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1954,6 +2086,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1969,18 +2105,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -1992,11 +2128,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d1fe9361d29..fc7f4748d34a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -548,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) - continue; - /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -873,7 +870,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cpumask_empty(new_cpus)) + if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -1129,7 +1126,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (nodes_empty(*new_mems)) + if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1979,7 +1976,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 07ce18ca71e0..0874e2edd275 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -604,7 +604,7 @@ return_normal: online_cpus) cpu_relax(); if (!time_left) - pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); + pr_crit("Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); + /* + * Avoid entering the debugger if we were triggered due to an oops + * but panic_timeout indicates the system should automatically + * reboot on panic. We don't want to get stuck waiting for input + * on such systems, especially if its "just" an oops. + */ + if (signo != SIGTRAP && panic_timeout) + return 1; memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); @@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self, unsigned long val, void *data) { + /* + * Avoid entering the debugger if we were triggered due to a panic + * We don't want to get stuck waiting for input from user in such case. + * panic_timeout indicates the system should automatically + * reboot on panic. + */ + if (panic_timeout) + return NOTIFY_DONE; + if (dbg_kdb_mode) kdb_printf("PANIC: %s\n", (char *)data); kgdb_breakpoint(); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..fc1ef736253c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -439,7 +439,7 @@ poll_again: * substituted for %d, %x or %o in the prompt. */ -char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); @@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } -int vkdb_printf(const char *fmt, va_list ap) +int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; int linecount; @@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + /* + * This was a interactive search (using '/' at more + * prompt) and it has completed. Clear the flag. + */ + kdb_grepping_flag = 0; /* * at this point the string is a full line and * should be printed, up to the null. @@ -691,19 +697,20 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); + cp = (char *) printk_skip_level(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { - gdbstub_msg_write(kdb_buffer, retlen); + gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen; - cp = kdb_buffer; + len = retlen - (cp - kdb_buffer); + cp2 = cp; while (len--) { - dbg_io_ops->write_char(*cp); - cp++; + dbg_io_ops->write_char(*cp2); + cp2++; } } while (c) { - c->write(c, kdb_buffer, retlen); + c->write(c, cp, retlen - (cp - kdb_buffer)); touch_nmi_watchdog(); c = c->next; } @@ -711,7 +718,10 @@ kdb_printit: if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; - printk(KERN_INFO "%s", kdb_buffer); + if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK) + printk("%s", kdb_buffer); + else + pr_info("%s", kdb_buffer); } if (KDB_STATE(PAGER)) { @@ -794,11 +804,23 @@ kdb_printit: kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '/' && !kdb_grepping_flag) { + kdb_printf("\r"); + kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, + kdbgetenv("SEARCHPROMPT") ?: "search> "); + *strchrnul(kdb_grep_string, '\n') = '\0'; + kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; + suspend_grep = 1; /* for this recursion */ } else if (buf1[0] && buf1[0] != '\n') { /* user hit something other than enter */ suspend_grep = 1; /* for this recursion */ - kdb_printf("\nOnly 'q' or 'Q' are processed at more " - "prompt, input ignored\n"); + if (buf1[0] != '/') + kdb_printf( + "\nOnly 'q', 'Q' or '/' are processed at " + "more prompt, input ignored\n"); + else + kdb_printf("\n'/' cannot be used during | " + "grep filtering, input ignored\n"); } else if (kdb_grepping_flag) { /* user hit enter */ suspend_grep = 1; /* for this recursion */ @@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...) int r; va_start(ap, fmt); - r = vkdb_printf(fmt, ap); + r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); va_end(ap); return r; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7b40c5f07dce..4121345498e0 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -50,8 +50,7 @@ static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); -#define GREP_LEN 256 -char kdb_grep_string[GREP_LEN]; +char kdb_grep_string[KDB_GREP_STRLEN]; int kdb_grepping_flag; EXPORT_SYMBOL(kdb_grepping_flag); int kdb_grep_leading; @@ -870,7 +869,7 @@ static void parse_grep(const char *str) len = strlen(cp); if (!len) return; - if (len >= GREP_LEN) { + if (len >= KDB_GREP_STRLEN) { kdb_printf("search string too long\n"); return; } @@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr) char *cp; char *cpp, quoted; kdbtab_t *tp; - int i, escaped, ignore_errors = 0, check_grep; + int i, escaped, ignore_errors = 0, check_grep = 0; /* * First tokenize the command string. */ cp = (char *)cmdstr; - kdb_grepping_flag = check_grep = 0; if (KDB_FLAG(CMD_INTERRUPT)) { /* Previous command was interrupted, newline must not @@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); - kdb_dumpregs(regs); break; case KDB_REASON_SSTEP: case KDB_REASON_BREAK: @@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ kdb_nextline = 1; KDB_STATE_CLEAR(SUPPRESS); + kdb_grepping_flag = 0; + /* ensure the old search does not leak into '/' commands */ + kdb_grep_string[0] = '\0'; cmdbuf = cmd_cur; *cmdbuf = '\0'; @@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) + if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv) #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" "Buffers: %8lu kB\n", - val.totalram, val.freeram, val.bufferram); + K(val.totalram), K(val.freeram), K(val.bufferram)); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index eaacd1693954..75014d7f4568 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, /* Miscellaneous functions and data areas */ extern int kdb_grepping_flag; +#define KDB_GREPPING_FLAG_SEARCH 0x8000 extern char kdb_grep_string[]; +#define KDB_GREP_STRLEN 256 extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; @@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -extern char *kdb_getstr(char *, size_t, char *); +extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ diff --git a/kernel/events/core.c b/kernel/events/core.c index f04daabfd1cf..2fabc0627165 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3591,7 +3591,7 @@ static void put_event(struct perf_event *event) ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); + perf_event_ctx_unlock(event, ctx); _free_event(event); } @@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ if (event->pending_disable) { event->pending_disable = 0; @@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8de927..752d6486b67e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,33 +1,7 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -# if-lt -# Usage VAR := $(call if-lt, $(a), $(b)) -# Returns 1 if (a < b) -if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) - -ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) - cc-ver := 0304 -else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) - cc-ver := 0407 -else -# Use cc-version if available, otherwise set 0 -# -# scripts/Kbuild.include, which contains cc-version function, is not included -# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" -# Meaning cc-ver is empty causing if-lt test to fail with -# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. -# This has no affect on the clean phase, but the error message could be -# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version -# is not available. We can probably move if-lt to Kbuild.include, so it's also -# not defined during clean or to include Kbuild.include in -# scripts/Makefile.clean. But the following workaround seems least invasive. - cc-ver := $(if $(call cc-version),$(call cc-version),0) -endif - -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o - -ifeq ($(call if-lt, $(cc-ver), 0407),1) - obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o -else - obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o -endif +obj-y := base.o fs.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ + gcc_3_4.o, gcc_4_7.o) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 196a06fbc122..886d09e691d5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1474,8 +1474,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). + * + * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and + * it cannot be set along with IRQF_NO_SUSPEND. */ - if ((irqflags & IRQF_SHARED) && !dev_id) + if (((irqflags & IRQF_SHARED) && !dev_id) || + (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || + ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 3ca532592704..5204a6d1b985 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - desc->no_suspend_depth != desc->nr_actions); + (desc->no_suspend_depth + + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc, int irq) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ff7f47d026ac..3f9f1d6b4c2e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj) /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { + struct module *mod; + if (!klp_is_module(obj)) return; mutex_lock(&module_mutex); /* - * We don't need to take a reference on the module here because we have - * the klp_mutex, which is also taken by the module notifier. This - * prevents any module from unloading until we release the klp_mutex. + * We do not want to block removal of patched modules and therefore + * we do not take a reference here. The patches are removed by + * a going module handler instead. + */ + mod = find_module(obj->name); + /* + * Do not mess work of the module coming and going notifiers. + * Note that the patch might still be needed before the going handler + * is called. Module functions can be called even in the GOING state + * until mod->exit() finishes. This is especially important for + * patches that modify semantic of the functions. */ - obj->mod = find_module(obj->name); + if (mod && mod->klp_alive) + obj->mod = mod; + mutex_unlock(&module_mutex); } @@ -248,11 +260,12 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, /* first, check if it's an exported symbol */ preempt_disable(); sym = find_symbol(name, NULL, NULL, true, true); - preempt_enable(); if (sym) { *addr = sym->value; + preempt_enable(); return 0; } + preempt_enable(); /* otherwise check if it's in another .o within the patch module */ return klp_find_object_symbol(pmod->name, name, addr); @@ -314,12 +327,12 @@ static void notrace klp_ftrace_handler(unsigned long ip, rcu_read_lock(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); - rcu_read_unlock(); - if (WARN_ON_ONCE(!func)) - return; + goto unlock; klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); } static int klp_disable_func(struct klp_func *func) @@ -731,7 +744,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->state = KLP_DISABLED; return kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, func->old_name); + obj->kobj, "%s", func->old_name); } /* parts of the initialization that is done only when the object is loaded */ @@ -766,6 +779,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return -EINVAL; obj->state = KLP_DISABLED; + obj->mod = NULL; klp_find_object_module(obj); @@ -807,7 +821,7 @@ static int klp_init_patch(struct klp_patch *patch) patch->state = KLP_DISABLED; ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, patch->mod->name); + klp_root_kobj, "%s", patch->mod->name); if (ret) goto unlock; @@ -960,6 +974,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, mutex_lock(&klp_mutex); + /* + * Each module has to know that the notifier has been called. + * We never know what module will get patched by a new patch. + */ + if (action == MODULE_STATE_COMING) + mod->klp_alive = true; + else /* MODULE_STATE_GOING */ + mod->klp_alive = false; + list_for_each_entry(patch, &klp_patches, list) { for (obj = patch->objs; obj->funcs; obj++) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) hash_head = classhashentry(key); /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: + * We do an RCU walk of the hash, see lockdep_free_key_range(). */ - list_for_each_entry(class, hash_head, hash_entry) { + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return NULL; + + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; - unsigned long flags; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry(class, hash_head, hash_entry) + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; + } + /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); return NULL; } - raw_local_irq_restore(flags); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\n"); dump_stack(); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); - raw_local_irq_restore(flags); out_set_class_cache: if (!subclass || force) @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, entry->distance = distance; entry->trace = *trace; /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * Both allocation and removal are done under the graph lock; but + * iteration is under RCU-sched; see look_up_lock_class() and + * lockdep_free_key_range(). */ list_add_tail_rcu(&entry->entry, head); @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, else head = &lock->class->locks_before; - list_for_each_entry(entry, head, entry) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; mark_lock_accessed(entry, lock); @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry(chain, hash_head, entry) { + list_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (unlikely(!debug_locks)) return; - if (subclass) + if (subclass) { + unsigned long flags; + + if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; register_lock_class(lock, subclass, 1); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + } } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one sync_sched() before getting here, so we're guaranteed + * nobody will look up these exact classes -- they're properly dead but still + * allocated. + */ void lockdep_free_key_range(void *start, unsigned long size) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i; @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) if (locked) graph_unlock(); raw_local_irq_restore(flags); + + /* + * Wait for any possible iterators from look_up_lock_class() to pass + * before continuing to free the memory they refer to. + * + * sync_sched() is sufficient because the read-side is IRQ disable. + */ + synchronize_sched(); + + /* + * XXX at this point we could return the resources to the pool; + * instead we leak them. We would need to change to bitmap allocators + * instead of the linear allocators we have now. + */ } void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i, j; @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3059bc2f022d..6357265a31ad 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1193,7 +1193,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); if (unlikely(ret)) { - remove_waiter(lock, &waiter); + __set_current_state(TASK_RUNNING); + if (rt_mutex_has_waiters(lock)) + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } diff --git a/kernel/module.c b/kernel/module.c index b34813f725e9..99fdf94efce8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,7 +56,6 @@ #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> -#include <linux/kasan.h> #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> @@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); - kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) @@ -1867,7 +1865,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes: */ + /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ @@ -2313,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_size += strtab_size; + mod->core_size = debug_align(mod->core_size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; + mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -3349,9 +3349,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* Free lock-classes: */ - lockdep_free_key_range(mod->module_core, mod->core_size); - /* we can't deallocate the module until we clear memory protection */ unset_module_init_ro_nx(mod); unset_module_core_ro_nx(mod); @@ -3375,6 +3372,9 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->module_core, mod->core_size); + module_deallocate(mod, info); free_copy: free_copy(info); diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index cbd69d842341..2ca4a8b5fe57 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -3,7 +3,7 @@ struct console_cmdline { - char name[8]; /* Name of the driver */ + char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 40d50cc4c686..879edfc5ee52 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args) #ifdef CONFIG_KGDB_KDB if (unlikely(kdb_trap_printk)) { - r = vkdb_printf(fmt, args); + r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } #endif @@ -2464,6 +2464,7 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0d7bbe3095ad..0a571e9a0f1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -326,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special.b.need_qs) { rcu_preempt_qs(); + t->rcu_read_unlock_special.b.need_qs = false; if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 8a2e230fb86a..eae160dd669d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) * so we don't have to move tasks around upon policy change, * or flail around trying to allocate bandwidth on the fly. * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. + * the policy change to proceed. */ free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) if (tg != &root_task_group) return false; - if (p->sched_class != &fair_sched_class) - return false; - /* * We can only assume the task group can't go away on us if * autogroup_move_group() can see us on ->thread_group list. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 7052d3fd4e7b..8d0f35debf35 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x) * first without taking the lock so we can * return early in the blocking case. */ - if (!ACCESS_ONCE(x->done)) + if (!READ_ONCE(x->done)) return 0; spin_lock_irqsave(&x->wait.lock, flags); @@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - return !!ACCESS_ONCE(x->done); + if (!READ_ONCE(x->done)) + return false; + + /* + * If ->done, we need to wait for complete() to release ->wait.lock + * otherwise we can end up freeing the completion before complete() + * is done referencing it. + * + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders + * the loads of ->done and ->wait.lock such that we cannot observe + * the lock before complete() acquires it while observing the ->done + * after it's acquired the lock. + */ + smp_rmb(); + spin_unlock_wait(&x->wait.lock); + return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13049aac05a6..62671f53202a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -307,66 +307,6 @@ __read_mostly int scheduler_running; int sysctl_sched_rt_runtime = 950000; /* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) - return rq; - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} - -/* * this_rq_lock - lock this runqueue and disable interrupts. */ static struct rq *this_rq_lock(void) @@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -static void preempt_schedule_common(void) +static void __sched notrace preempt_schedule_common(void) { do { __preempt_count_add(PREEMPT_ACTIVE); @@ -3094,6 +3034,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } @@ -4418,36 +4360,29 @@ EXPORT_SYMBOL_GPL(yield_to); * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - long __sched io_schedule_timeout(long timeout) { - struct rq *rq = raw_rq(); + int old_iowait = current->in_iowait; + struct rq *rq; long ret; + current->in_iowait = 1; + if (old_iowait) + blk_schedule_flush_plug(current); + else + blk_flush_plug(current); + delayacct_blkio_start(); + rq = raw_rq(); atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->in_iowait = old_iowait; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); + return ret; } +EXPORT_SYMBOL(io_schedule_timeout); /** * sys_sched_get_priority_max - return maximum RT priority. @@ -7642,6 +7577,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + for_each_process_thread(g, p) { if (rt_task(p) && task_group(p) == tg) return 1; @@ -7734,6 +7675,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg, { int i, err = 0; + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); @@ -7790,9 +7742,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) rt_period = (u64)rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; - if (rt_period == 0) - return -EINVAL; - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a027799ae130..3fa8fa6d9403 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); + unsigned long flags; struct rq *rq; -again: - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (rq != task_rq(p)) { - /* Task was moved, retrying. */ - raw_spin_unlock(&rq->lock); - goto again; - } + rq = task_rq_lock(current, &flags); /* * We need to take care of several possible races here: @@ -541,6 +535,26 @@ again: sched_clock_tick(); update_rq_clock(rq); + + /* + * If the throttle happened during sched-out; like: + * + * schedule() + * deactivate_task() + * dequeue_task_dl() + * update_curr_dl() + * start_dl_timer() + * __dequeue_task_dl() + * prev->on_rq = 0; + * + * We can be both throttled and !queued. Replenish the counter + * but do not enqueue -- wait for our wakeup to do that. + */ + if (!task_on_rq_queued(p)) { + replenish_dl_entity(dl_se, dl_se); + goto unlock; + } + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -555,7 +569,7 @@ again: push_dl_task(rq); #endif unlock: - raw_spin_unlock(&rq->lock); + task_rq_unlock(rq, current, &flags); return HRTIMER_NORESTART; } @@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq) rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } + update_rq_clock(rq); update_curr_dl(rq); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 94b2d7b88a27..80014a178342 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -82,6 +82,7 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -105,6 +106,9 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* * Suspend-to-idle ("freeze") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only @@ -115,30 +119,24 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ if (idle_should_freeze()) { - cpuidle_enter_freeze(); - local_irq_enable(); - goto exit_idle; - } + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { + local_irq_enable(); + goto exit_idle; + } - /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. - */ - next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; /* - * We can't use the cpuidle framework, let's use the default - * idle routine. + * Ask the cpuidle framework to choose a convenient idle state. */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; + next_state = cpuidle_select(drv, dev); } - + /* Fall back to the default arch idle method on errors. */ + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -183,7 +181,8 @@ use_default: /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); @@ -196,6 +195,19 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0870db23d79c..dc0f435a2779 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { } extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +static inline void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sys.c b/kernel/sys.c index ea9c88109894..a03d9cd23ed7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -97,6 +97,12 @@ #ifndef MPX_DISABLE_MANAGEMENT # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) #endif +#ifndef GET_FP_MODE +# define GET_FP_MODE(a) (-EINVAL) +#endif +#ifndef SET_FP_MODE +# define SET_FP_MODE(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -1102,6 +1108,7 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. */ static int override_release(char __user *release, size_t len) { @@ -1121,7 +1128,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); @@ -2219,6 +2226,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = MPX_DISABLE_MANAGEMENT(me); break; + case PR_SET_FP_MODE: + error = SET_FP_MODE(me, arg2); + break; + case PR_GET_FP_MODE: + error = GET_FP_MODE(me); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 83d907afb4a6..4012336de30f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1229,6 +1229,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = &zero, + }, + { .procname = "nr_pdflush_threads", .mode = 0444 /* read-only */, .proc_handler = pdflush_proc_obsolete, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4b585e0fdd22..0f60b08a4f07 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -633,10 +633,14 @@ int ntp_validate_timex(struct timex *txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; - if (txc->modes & ADJ_FREQUENCY) { - if (LONG_MIN / PPM_SCALE > txc->freq) + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; - if (LONG_MAX / PPM_SCALE < txc->freq) + if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { + int bc_moved; /* * We try to cancel the timer first. If the callback is on * flight on some other cpu then we let it handle it. If we @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * restart the timer because we are in the callback, but we * can set the expiry time and let the callback return * HRTIMER_RESTART. + * + * Since we are in the idle loop at this point and because + * hrtimer_{start/cancel} functions call into tracing, + * calls to these functions must be bound within RCU_NONIDLE. */ - if (hrtimer_try_to_cancel(&bctimer) >= 0) { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? + !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : + 0); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); } else if (bc->bound_on == smp_processor_id()) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45e5cb143d17..4f228024055b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1059,6 +1059,12 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_graph_active; +#else +# define ftrace_graph_active 0 +#endif + #ifdef CONFIG_DYNAMIC_FTRACE static struct ftrace_ops *removed_ops; @@ -2041,8 +2047,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (!ftrace_rec_count(rec)) rec->flags = 0; else - /* Just disable the record (keep REGS state) */ - rec->flags &= ~FTRACE_FL_ENABLED; + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. + */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN); } return FTRACE_UPDATE_MAKE_NOP; @@ -2688,24 +2698,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) static void ftrace_startup_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } } static void ftrace_shutdown_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_DISABLE_CALLS); + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } } static cycle_t ftrace_update_time; @@ -5558,12 +5580,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { - ftrace_startup_sysctl(); - /* we are starting ftrace again */ if (ftrace_ops_list != &ftrace_list_end) update_ftrace_function(); + ftrace_startup_sysctl(); + } else { /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; @@ -5590,8 +5612,6 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; -static int ftrace_graph_active; - int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f28849394791..41ff75b478c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2728,19 +2728,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2749,6 +2787,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } |