diff options
| author | Wolfram Sang <wsa@the-dreams.de> | 2018-01-03 22:50:51 +0100 |
|---|---|---|
| committer | Wolfram Sang <wsa@the-dreams.de> | 2018-01-03 22:50:51 +0100 |
| commit | fddfa22a4403cd19548de075ddada0c7c966a232 (patch) | |
| tree | f981a63ff0c1062aecb5ab48b0585ffe97a7c20b /kernel | |
| parent | 639136d2a70ab9837befb22ad5b3d67cb4db2216 (diff) | |
| parent | 0f30aca72c3b68f4b6a443193b574f14106cd61e (diff) | |
Merge tag 'at24-4.16-updates-for-wolfram' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux into i2c/for-4.16
"AT24 updates for 4.16 merge window
The driver has been converted to using regmap instead of raw i2c and
smbus calls which shrank the code significantly.
Device tree binding document has been cleaned up. Device tree support in
the driver has been improved and we now support all at24 models as well
as two new DT properties (no-read-rollover and wp-gpios).
We no longer user unreadable magic values for driver data as the way it
was implemented caused problems for some EEPROM models - we switched to
regular structs.
Aside from that, there's a bunch of coding style fixes and minor
improvements all over the place."
Diffstat (limited to 'kernel')
40 files changed, 579 insertions, 933 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b9f8686a84cf..86b50aa26ee8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1447,7 +1447,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) - cnt++; + if (*prog != &dummy_bpf_prog.prog) + cnt++; rcu_read_unlock(); return cnt; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..3905d4bc5b80 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 68ec884440b7..8455b89d1bbf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + */ + #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..04b24876cd23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1059,6 +1059,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; + /* The stack spill tracking logic in check_stack_write() + * and check_stack_read() relies on stack accesses being + * aligned. + */ + strict = true; break; default: break; @@ -1067,6 +1072,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +/* truncate register to smaller size (in bytes) + * must be called with size < BPF_REG_SIZE + */ +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +{ + u64 mask; + + /* clear high bits in bit representation */ + reg->var_off = tnum_cast(reg->var_off, size); + + /* fix arithmetic bounds */ + mask = ((u64)1 << (size * 8)) - 1; + if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { + reg->umin_value &= mask; + reg->umax_value &= mask; + } else { + reg->umin_value = 0; + reg->umax_value = mask; + } + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1200,9 +1228,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - regs[value_regno].var_off = - tnum_cast(regs[value_regno].var_off, size); - __update_reg_bounds(®s[value_regno]); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1282,6 +1308,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); + return -EACCES; } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -1674,7 +1701,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; @@ -1766,14 +1799,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void coerce_reg_to_32(struct bpf_reg_state *reg) -{ - /* clear high 32 bits */ - reg->var_off = tnum_cast(reg->var_off, 4); - /* Update bounds */ - __update_reg_bounds(reg); -} - static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -1794,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", + reg_type_str[type], val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", + reg_type_str[type], reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + reg_type_str[type]); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", + smin, reg_type_str[type]); + return false; + } + + return true; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1830,29 +1890,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - if (!env->allow_ptr_leaks) - verbose(env, - "R%d 32-bit pointer arithmetic prohibited\n", - dst); + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); return -EACCES; } @@ -1862,6 +1918,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; + if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + return -EINVAL; + switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow @@ -1915,9 +1975,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d tried to subtract pointer from scalar\n", - dst); + verbose(env, "R%d tried to subtract pointer from scalar\n", + dst); return -EACCES; } /* We don't allow subtraction from FP, because (according to @@ -1925,9 +1984,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. */ if (ptr_reg->type == PTR_TO_STACK) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d subtraction from stack pointer prohibited\n", - dst); + verbose(env, "R%d subtraction from stack pointer prohibited\n", + dst); return -EACCES; } if (known && (ptr_reg->off - smin_val == @@ -1976,28 +2034,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_AND: case BPF_OR: case BPF_XOR: - /* bitwise ops on pointers are troublesome, prohibit for now. - * (However, in principle we could allow some cases, e.g. - * ptr &= ~3 which would reduce min_value by 3.) - */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d bitwise operator %s on pointer prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + /* bitwise ops on pointers are troublesome, prohibit. */ + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; } + if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + return -EINVAL; + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; } +/* WARNING: This function does calculations on 64-bit values, but the actual + * execution may occur on 32-bit values. Therefore, things like bitshifts + * need extra checks in the 32-bit case. + */ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, @@ -2008,12 +2068,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->64 */ - coerce_reg_to_32(dst_reg); - coerce_reg_to_32(&src_reg); - } smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -2021,6 +2077,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if (!src_known && + opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + __mark_reg_unknown(dst_reg); + return 0; + } + switch (opcode) { case BPF_ADD: if (signed_add_overflows(dst_reg->smin_value, smin_val) || @@ -2149,9 +2211,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -2177,27 +2239,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->smin_value < 0) { - if (umin_val) { - /* Sign bit will be cleared */ - dst_reg->smin_value = 0; - } else { - /* Lost sign bit information */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - } else { - dst_reg->smin_value = - (u64)(dst_reg->smin_value) >> umax_val; - } + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; if (src_known) dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); @@ -2213,6 +2277,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; } + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->32 */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; @@ -2227,7 +2297,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); - int rc; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -2238,43 +2307,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields - * an arbitrary scalar. + * an arbitrary scalar. Disallow all math except + * pointer subtraction */ - if (!env->allow_ptr_leaks) { - verbose(env, "R%d pointer %s pointer prohibited\n", - insn->dst_reg, - bpf_alu_string[opcode >> 4]); - return -EACCES; + if (opcode == BPF_SUB){ + mark_reg_unknown(env, regs, insn->dst_reg); + return 0; } - mark_reg_unknown(env, regs, insn->dst_reg); - return 0; + verbose(env, "R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; } else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range */ - rc = adjust_ptr_min_max_vals(env, insn, - src_reg, dst_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* scalar += unknown scalar */ - __mark_reg_unknown(&off_reg); - return adjust_scalar_min_max_vals( - env, insn, - dst_reg, off_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ - rc = adjust_ptr_min_max_vals(env, insn, - dst_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += scalar */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, *src_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); } } else { /* Pretend the src is a reg with a known value, since we only @@ -2283,17 +2338,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, off_reg.type = SCALAR_VALUE; __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - if (ptr_reg) { /* pointer += K */ - rc = adjust_ptr_min_max_vals(env, insn, - ptr_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += K */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, off_reg); - } - return rc; - } + if (ptr_reg) /* pointer += K */ + return adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); } /* Got here implies adding two SCALAR_VALUEs */ @@ -2390,17 +2437,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(env, regs, insn->dst_reg); - /* high 32 bits are known zero. */ - regs[insn->dst_reg].var_off = tnum_cast( - regs[insn->dst_reg].var_off, 4); - __update_reg_bounds(®s[insn->dst_reg]); + coerce_reg_to_size(®s[insn->dst_reg], 4); } } else { /* case: R = imm * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - __mark_reg_known(regs + insn->dst_reg, insn->imm); + if (BPF_CLASS(insn->code) == BPF_ALU64) { + __mark_reg_known(regs + insn->dst_reg, + insn->imm); + } else { + __mark_reg_known(regs + insn->dst_reg, + (u32)insn->imm); + } } } else if (opcode > BPF_END) { @@ -3431,15 +3481,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); } else { - /* if we knew anything about the old value, we're not - * equal, because we can't know anything about the - * scalar value of the pointer in the new value. + /* We're trying to use a pointer in place of a scalar. + * Even if the scalar was unbounded, this could lead to + * pointer leaks because scalars are allowed to leak + * while pointers are not. We could make this safe in + * special cases if root is calling us, but it's + * probably not worth the hassle. */ - return rold->umin_value == 0 && - rold->umax_value == U64_MAX && - rold->smin_value == S64_MIN && - rold->smax_value == S64_MAX && - tnum_is_unknown(rold->var_off); + return false; } case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); diff --git a/kernel/cpu.c b/kernel/cpu.c index 04892a82f6ac..53f7dc65f9a3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); -static void inline cpuhp_lock_acquire(bool bringup) +static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } -static void inline cpuhp_lock_release(bool bringup) +static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else -static void inline cpuhp_lock_acquire(bool bringup) { } -static void inline cpuhp_lock_release(bool bringup) { } +static inline void cpuhp_lock_acquire(bool bringup) { } +static inline void cpuhp_lock_release(bool bringup) { } #endif @@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all - * runnable tasks from the cpu, there's only the idle task left now + * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed + * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ @@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_AP_SMPCFD_DYING] = { - .name = "smpcfd:dying", - .startup.single = NULL, - .teardown.single = smpcfd_dying_cpu, - }, /* * Handled on controll processor until the plugged processor manages * this itself. @@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, + [CPUHP_AP_SMPCFD_DYING] = { + .name = "smpcfd:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, + }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index e74be38245ad..ed5d34925ad0 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -350,7 +350,7 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (kallsyms_symbol_next(p_tmp, i) < 0) + if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) break; kdb_printf("%s ", p_tmp); *(p_tmp + len) = '\0'; diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..4df5b695bf0d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6639,6 +6639,7 @@ static void perf_event_namespaces_output(struct perf_event *event, struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; + u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) @@ -6649,7 +6650,7 @@ static void perf_event_namespaces_output(struct perf_event *event, ret = perf_output_begin(&handle, event, namespaces_event->event_id.header.size); if (ret) - return; + goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); @@ -6661,6 +6662,8 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); +out: + namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, @@ -7987,11 +7990,11 @@ static void bpf_overflow_handler(struct perf_event *event, { struct bpf_perf_event_data_kern ctx = { .data = data, - .regs = regs, .event = event, }; int ret = 0; + ctx.regs = perf_arch_bpf_user_pt_regs(regs); preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..df0c91d5606c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,11 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf6b58c..2295fc69717f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); diff --git a/kernel/futex.c b/kernel/futex.c index 76ed5921117a..57d0b3657e16 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; - int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); - int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..e4d3819a91cc 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -12,6 +12,11 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); + + if (!__ratelimit(&ratelimit)) + return; + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac39653..acfaaef8672a 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ab19371eab9b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..62068ad46930 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 7df2480005f8..0ba0dd8863a7 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -384,7 +384,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) { struct cpumap *cm = this_cpu_ptr(m->maps); - return (m->global_available - cpudown) ? cm->available : 0; + if (!cpudown) + return m->global_available; + return m->global_available - cm->available; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index edb987b2c58d..2f3c4f5382cc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } +/* + * Carefully check whether the device can use reservation mode. If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and render the device unusable. If the entry can be masked then the core + * logic will prevent the spurious interrupt and reservation mode can be + * used. For now reservation mode is restricted to PCI/MSI. + */ +static bool msi_check_reservation_mode(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + struct msi_desc *desc; + + if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + return false; + + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) + return false; + + if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + return false; + + /* + * Checking the first MSI descriptor is sufficient. MSIX supports + * masking and MSI does so when the maskbit is set. + */ + desc = first_msi_entry(dev); + return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - msi_alloc_info_t arg; + struct irq_data *irq_data; struct msi_desc *desc; + msi_alloc_info_t arg; int i, ret, virq; + bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); + can_reserve = msi_check_reservation_mode(domain, info, dev); + for_each_msi_entry(desc, dev) { virq = desc->irq; if (desc->nvec_used == 1) @@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * the MSI entries before the PCI layer enables MSI in the * card. Otherwise the card latches a random msi message. */ - if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { - struct irq_data *irq_data; + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + continue; + irq_data = irq_domain_get_irq_data(domain, desc->irq); + if (!can_reserve) + irqd_clr_can_reserve(irq_data); + ret = irq_domain_activate_irq(irq_data, can_reserve); + if (ret) + goto cleanup; + } + + /* + * If these interrupts use reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (can_reserve) { + for_each_msi_entry(desc, dev) { irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); - if (ret) - goto cleanup; - if (info->flags & MSI_FLAG_MUST_REACTIVATE) - irqd_clr_activated(irq_data); + irqd_clr_activated(irq_data); } } return 0; diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9776da8db180..5fa1324a4f29 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include <linux/slab.h> -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4675,494 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..b9006617710f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl) void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - - printk("%stask: %p task.stack: %p\n", - log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..644fa2e3d993 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5133,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..d6717a3331a1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..2fe3aa853e4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * - * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and - * simply copies the running sum over. + * Per the above update_tg_cfs_util() is trivial and simply copies the running + * sum over (but still wrong, because the group entity and group rq do not have + * their PELT windows aligned). * * However, update_tg_cfs_runnable() is more complex. So we have: * @@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se, * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * - * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * - * ge->avg.running_avg == grq->avg.running_avg + * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * @@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se, * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * - * OK, so what then? + * Another reason this doesn't work is that runnable isn't a 0-sum entity. + * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the + * rq itself is runnable anywhere between 2/3 and 1 depending on how the + * runnable section of these tasks overlap (or not). If they were to perfectly + * align the rq as a whole would be runnable 2/3 of the time. If however we + * always have at least 1 runnable task, the rq as a whole is always runnable. * + * So we'll have to approximate.. :/ * - * Another way to look at things is: + * Given the constraint: * - * grq->avg.load_avg = \Sum se->avg.load_avg + * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * - * Therefore, per (2): + * We can construct a rule that adds runnable to a rq by assuming minimal + * overlap. * - * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * On removal, we'll assume each task is equally runnable; which yields: * - * And the very thing we're propagating is a change in that sum (someone - * joined/left). So we can easily know the runnable change, which would be, per - * (2) the already tracked se->load_avg divided by the corresponding - * se->weight. + * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * - * Basically (4) but in differential form: + * XXX: only do this for the part of runnable > running ? * - * d(runnable_avg) += se->avg.load_avg / se->load.weight - * (5) - * ge->avg.load_avg += ge->load.weight * d(runnable_avg) */ static inline void @@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq if (!delta) return; + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; @@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long runnable_sum = gcfs_rq->prop_runnable_sum; - long runnable_load_avg, load_avg; - s64 runnable_load_sum, load_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + unsigned long runnable_load_avg, load_avg; + u64 runnable_load_sum, load_sum = 0; + s64 delta_sum; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + if (runnable_sum >= 0) { + /* + * Add runnable; clip at LOAD_AVG_MAX. Reflects that until + * the CPU is saturated running == runnable. + */ + runnable_sum += se->avg.load_sum; + runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + } else { + /* + * Estimate the new unweighted runnable_sum of the gcfs_rq by + * assuming all tasks are equally runnable. + */ + if (scale_load_down(gcfs_rq->load.weight)) { + load_sum = div_s64(gcfs_rq->avg.load_sum, + scale_load_down(gcfs_rq->load.weight)); + } + + /* But make sure to not inflate se's runnable */ + runnable_sum = min(se->avg.load_sum, load_sum); + } + + /* + * runnable_sum can't be lower than running_sum + * As running sum is scale with cpu capacity wehreas the runnable sum + * is not we rescale running_sum 1st + */ + running_sum = se->avg.util_sum / + arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + runnable_sum = max(runnable_sum, running_sum); + load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, LOAD_AVG_MAX); - add_positive(&se->avg.load_sum, runnable_sum); - add_positive(&se->avg.load_avg, load_avg); + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; + delta_avg = load_avg - se->avg.load_avg; - add_positive(&cfs_rq->avg.load_avg, load_avg); - add_positive(&cfs_rq->avg.load_sum, load_sum); + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&se->avg.runnable_load_sum, runnable_sum); - add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..665ace2fc558 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 98feab7933c7..929ecb7d6b78 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - __add_wait_queue_entry_tail(wq_head, wq_entry); + __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e776fc8cc1df..f6b5f19223d6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -95,6 +95,7 @@ config NO_HZ_FULL select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK + select CPU_ISOLATION help Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..ec999f32c840 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f06c8d4..f7cc7abfcf25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* @@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void) } /** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + +/** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf878fba..89a9e1b4264a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } @@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } @@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) base->must_forward_clk = false; __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } @@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..904c952ac383 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 27d1f4ffa3de..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) @@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = { static DEFINE_MUTEX(bpf_event_mutex); +#define BPF_TRACE_MAX_PROGS 64 + int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { @@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; old_array = event->tp_event->prog_array; + if (old_array && + bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) goto unlock; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..9ab18995ff1e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } @@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; @@ -4406,8 +4404,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); unsigned long flags; + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4419,6 +4422,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..2a8d8a294345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; @@ -6780,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -6834,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; @@ -7599,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } @@ -7622,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..43d18cb46308 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> -#include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> @@ -48,6 +47,7 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> +#include <linux/sched/isolation.h> #include "workqueue_internal.h" @@ -1634,7 +1634,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -4510,9 +4510,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4589,16 +4588,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { @@ -4709,12 +4698,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4722,9 +4712,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } @@ -4957,6 +4944,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5546,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
