diff options
Diffstat (limited to 'kernel')
104 files changed, 1832 insertions, 965 deletions
diff --git a/kernel/audit.h b/kernel/audit.h index a60d2840559e..5156ecd35457 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,7 +199,7 @@ struct audit_context { int argc; } execve; struct { - char *name; + const char *name; } module; struct { struct audit_ntp_data ntp_data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cd57053b4a69..dae80e4dfcce 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2870,7 +2870,7 @@ void __audit_openat2_how(struct open_how *how) context->type = AUDIT_OPENAT2; } -void __audit_log_kern_module(char *name) +void __audit_log_kern_module(const char *name) { struct audit_context *context = audit_context(); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 6547fb7ac0dc..129a51b1da1b 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -162,6 +162,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -170,21 +171,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - if (!bpf_cgrp_storage_trylock()) - return (unsigned long)NULL; + nobusy = bpf_cgrp_storage_trylock(); - sdata = cgroup_storage_lookup(cgroup, map, true); + sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); unlock: - bpf_cgrp_storage_unlock(); + if (nobusy) + bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index b70d0eef8a28..2285b27ce68c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -147,39 +147,6 @@ void bpf_struct_ops_image_free(void *image) } #define MAYBE_NULL_SUFFIX "__nullable" -#define MAX_STUB_NAME 128 - -/* Return the type info of a stub function, if it exists. - * - * The name of a stub function is made up of the name of the struct_ops and - * the name of the function pointer member, separated by "__". For example, - * if the struct_ops type is named "foo_ops" and the function pointer - * member is named "bar", the stub function name would be "foo_ops__bar". - */ -static const struct btf_type * -find_stub_func_proto(const struct btf *btf, const char *st_op_name, - const char *member_name) -{ - char stub_func_name[MAX_STUB_NAME]; - const struct btf_type *func_type; - s32 btf_id; - int cp; - - cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", - st_op_name, member_name); - if (cp >= MAX_STUB_NAME) { - pr_warn("Stub function name too long\n"); - return NULL; - } - btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); - if (btf_id < 0) - return NULL; - func_type = btf_type_by_id(btf, btf_id); - if (!func_type) - return NULL; - - return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ -} /* Prepare argument info for every nullable argument of a member of a * struct_ops type. @@ -204,27 +171,42 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name, static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, - const struct btf_type *func_proto, + const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; + char ksym[KSYM_SYMBOL_LEN]; + const char *stub_fname; + s32 stub_func_id; u32 arg_btf_id; int offset; - stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); - if (!stub_func_proto) - return 0; + stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); + if (!stub_fname) { + pr_warn("Cannot find the stub function name for the %s in struct %s\n", + member_name, st_ops_name); + return -ENOENT; + } + + stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); + if (stub_func_id < 0) { + pr_warn("Cannot find the stub function %s in btf\n", stub_fname); + return -ENOENT; + } + + stub_func_proto = btf_type_by_id(btf, stub_func_id); + stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. */ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { - pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", - st_ops_name, member_name, member_name, st_ops_name); + pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", + stub_fname, member_name, st_ops_name); return -EINVAL; } @@ -254,21 +236,21 @@ static int prepare_arg_info(struct btf *btf, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { - pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", - st_ops_name, member_name, MAYBE_NULL_SUFFIX); + pr_warn("stub function %s has %s tagging to an unsupported type\n", + stub_fname, MAYBE_NULL_SUFFIX); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { - pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", - st_ops_name, member_name, arg_no); + pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", + stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { - pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", - arg_no, st_ops_name, member_name); + pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n", + arg_no, stub_fname); goto err_out; } @@ -325,6 +307,13 @@ static bool is_module_member(const struct btf *btf, u32 id) return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -388,7 +377,10 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, for_each_member(i, t, member) { const struct btf_type *func_proto; + void **stub_func_addr; + u32 moff; + moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", @@ -414,7 +406,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (!func_proto) + + /* The member is not a function pointer or + * the function pointer is not supported. + */ + if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; if (btf_distill_func_proto(log, btf, @@ -426,8 +422,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, - func_proto, + func_proto, stub_func_addr, arg_info + i); if (err) goto errout; @@ -580,7 +577,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; @@ -1153,13 +1150,6 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } -int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) -{ - void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); - - return func_ptr ? 0 : -ENOTSUPP; -} - static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c54c148a94f..f83bd019db14 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6684,10 +6684,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 025d7e2214ae..c0d606c40195 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a60a6a2ce0d7..9380e0fd5e4a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -778,7 +778,10 @@ bool is_bpf_text_address(unsigned long addr) struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { - struct bpf_ksym *ksym = bpf_ksym_find(addr); + struct bpf_ksym *ksym; + + WARN_ON_ONCE(!rcu_read_lock_held()); + ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? container_of(ksym, struct bpf_prog_aux, ksym)->prog : @@ -2296,47 +2299,54 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); - bool ret; struct bpf_prog_aux *aux = fp->aux; + enum bpf_cgroup_storage_type i; + bool ret = false; + u64 cookie; if (fp->kprobe_override) - return false; - - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. - */ - if (bpf_prog_is_dev_bound(aux)) - return false; + return ret; - spin_lock(&map->owner.lock); - if (!map->owner.type) { - /* There's no owner yet where we could check for - * compatibility. - */ - map->owner.type = prog_type; - map->owner.jited = fp->jited; - map->owner.xdp_has_frags = aux->xdp_has_frags; - map->owner.attach_func_proto = aux->attach_func_proto; + spin_lock(&map->owner_lock); + /* There's no owner yet where we could check for compatibility. */ + if (!map->owner) { + map->owner = bpf_map_owner_alloc(map); + if (!map->owner) + goto err; + map->owner->type = prog_type; + map->owner->jited = fp->jited; + map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->attach_func_proto = aux->attach_func_proto; + for_each_cgroup_storage_type(i) { + map->owner->storage_cookie[i] = + aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + } ret = true; } else { - ret = map->owner.type == prog_type && - map->owner.jited == fp->jited && - map->owner.xdp_has_frags == aux->xdp_has_frags; + ret = map->owner->type == prog_type && + map->owner->jited == fp->jited && + map->owner->xdp_has_frags == aux->xdp_has_frags; + for_each_cgroup_storage_type(i) { + if (!ret) + break; + cookie = aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + ret = map->owner->storage_cookie[i] == cookie || + !cookie; + } if (ret && - map->owner.attach_func_proto != aux->attach_func_proto) { + map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -2349,11 +2359,24 @@ bool bpf_prog_map_compatible(struct bpf_map *map, } } } - spin_unlock(&map->owner.lock); - +err: + spin_unlock(&map->owner_lock); return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2366,7 +2389,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -2377,8 +2400,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2387,15 +2411,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2423,7 +2448,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ec941a0ea41..570e2f723144 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -198,12 +198,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab) static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) @@ -2223,7 +2223,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ @@ -2355,7 +2355,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, - offsetof(struct htab_elem, key) + map->key_size); + offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a05aeb345896..a0bf39b7359a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -129,7 +129,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -882,6 +883,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1])) { + if (tmp_buf) + cur_arg = raw_args[num_spec]; + goto nocopy_fmt; + } + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; @@ -889,11 +897,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, goto fmt_str; } - if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || - ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { - /* just kernel pointers */ if (tmp_buf) cur_arg = raw_args[num_spec]; i++; @@ -1270,8 +1276,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -2775,9 +2784,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; - if (!is_bpf_text_address(ip)) - return !ctx->cnt; + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index c9d45c9d6918..f9b11d01c3b5 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -10,7 +10,6 @@ menuconfig BPF_PRELOAD # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST - select USERMODE_DRIVER help This builds kernel module with several embedded BPF programs that are pinned into BPF FS mount point as human readable files that are diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895..fdad0eb308fe 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -89,4 +89,5 @@ static void __exit fini(void) } late_initcall(load); module_exit(fini); +MODULE_IMPORT_NS(BPF_INTERNAL); MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 696e5a2cbea2..ba4543e771a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/cookie.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -51,6 +52,7 @@ #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); +DEFINE_COOKIE(bpf_map_cookie); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); @@ -765,6 +767,7 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + bpf_map_owner_free(map); bpf_map_free(map); } @@ -859,12 +862,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map_type_contains_progs(map)) { - spin_lock(&map->owner.lock); - type = map->owner.type; - jited = map->owner.jited; - spin_unlock(&map->owner.lock); + spin_lock(&map->owner_lock); + if (map->owner) { + type = map->owner->type; + jited = map->owner->jited; } + spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" @@ -1360,10 +1363,14 @@ static int map_create(union bpf_attr *attr) if (err < 0) goto free_map; + preempt_disable(); + map->cookie = gen_cookie_next(&bpf_map_cookie); + preempt_enable(); + atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - spin_lock_init(&map->owner.lock); + spin_lock_init(&map->owner_lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, @@ -1457,7 +1464,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, BPF_INTERNAL); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -3223,7 +3230,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) bpf_link_inc(link); return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, BPF_INTERNAL); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -4042,7 +4049,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4606,6 +4614,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4752,8 +4762,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -5853,7 +5861,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, BPF_INTERNAL); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9000806ee3ba..24ae8f33e5d7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -400,7 +400,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM; + type == PTR_TO_MEM || + type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) @@ -1376,13 +1377,6 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void clear_jmp_history(struct bpf_verifier_state *state) -{ - kfree(state->jmp_history); - state->jmp_history = NULL; - state->jmp_history_cnt = 0; -} - static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1392,7 +1386,6 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } - clear_jmp_history(state); if (free_self) kfree(state); } @@ -1418,13 +1411,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; - dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); - if (!dst_state->jmp_history) - return -ENOMEM; - dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. */ @@ -1443,10 +1429,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->insn_hist_start = src->insn_hist_start; + dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->loop_entry = src->loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -2495,9 +2484,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). + * But we do need to make sure to not clobber insn_hist, so we keep + * chaining insn_hist_start/insn_hist_end indices as for a normal + * child state. */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; + elem->st.insn_hist_start = env->cur_state->insn_hist_end; + elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -2528,16 +2522,36 @@ static int cmp_subprogs(const void *a, const void *b) ((struct bpf_subprog_info *)b)->start; } +/* Find subprogram that contains instruction at 'off' */ +static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *vals = env->subprog_info; + int l, r, m; + + if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) + return NULL; + + l = 0; + r = env->subprog_cnt - 1; + while (l < r) { + m = l + (r - l + 1) / 2; + if (vals[m].start <= off) + l = m; + else + r = m - 1; + } + return &vals[l]; +} + +/* Find subprogram that starts exactly at 'off' */ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs); - if (!p) + p = find_containing_subprog(env, off); + if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; - } static int add_subprog(struct bpf_verifier_env *env, int off) @@ -2967,6 +2981,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return res ? &res->func_model : NULL; } +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i = 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3477,11 +3506,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - u32 cnt = cur->jmp_history_cnt; - struct bpf_jmp_history_entry *p; + struct bpf_insn_hist_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3501,29 +3529,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st return 0; } - cnt++; - alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - cur->jmp_history = p; + if (cur->insn_hist_end + 1 > env->insn_hist_cap) { + alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); + p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + env->insn_hist = p; + env->insn_hist_cap = alloc_size / sizeof(*p); + } - p = &cur->jmp_history[cnt - 1]; + p = &env->insn_hist[cur->insn_hist_end]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - cur->jmp_history_cnt = cnt; + + cur->insn_hist_end++; env->cur_hist_ent = p; return 0; } -static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, - u32 hist_end, int insn_idx) +static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, + u32 hist_start, u32 hist_end, int insn_idx) { - if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) - return &st->jmp_history[hist_end - 1]; + if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) + return &env->insn_hist[hist_end - 1]; return NULL; } @@ -3540,25 +3571,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat * history entry recording a jump from last instruction of parent state and * first instruction of given state. */ -static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, - u32 *history) +static int get_prev_insn_idx(const struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + int insn_idx, u32 hist_start, u32 *hist_endp) { - u32 cnt = *history; + u32 hist_end = *hist_endp; + u32 cnt = hist_end - hist_start; - if (i == st->first_insn_idx) { + if (insn_idx == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && st->jmp_history[0].idx == i) + if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) return -ENOENT; } - if (cnt && st->jmp_history[cnt - 1].idx == i) { - i = st->jmp_history[cnt - 1].prev_idx; - (*history)--; + if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { + (*hist_endp)--; + return env->insn_hist[hist_end - 1].prev_idx; } else { - i--; + return insn_idx - 1; } - return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -3730,7 +3762,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -3775,7 +3807,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) + struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -4035,8 +4067,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * before it would be equally necessary to * propagate it to dreg. */ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before @@ -4194,7 +4228,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded jmp_history and is able to + * logic). This backtracking relies on recorded insn_hist and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. @@ -4311,8 +4345,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 history = st->jmp_history_cnt; - struct bpf_jmp_history_entry *hist; + u32 hist_start = st->insn_hist_start; + u32 hist_end = st->insn_hist_end; + struct bpf_insn_hist_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4351,7 +4386,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_jmp_hist_entry(st, history, i); + hist = get_insn_hist_entry(env, hist_start, hist_end, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4368,7 +4403,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(st, i, &history); + i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4735,7 +4770,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5042,7 +5077,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -9811,6 +9846,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + if (env->subprog_info[subprog].changes_pkt_data) + clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); @@ -15016,6 +15053,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); + /* Forget the ranges before narrowing tnums, to avoid invariant + * violations if we're on a dead branch. + */ + __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); @@ -15381,6 +15422,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -15440,6 +15482,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -15449,6 +15496,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + } + + if (insn_flags) { + err = push_insn_history(env, this_branch, insn_flags, 0); + if (err) + return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; @@ -15504,7 +15560,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -16001,6 +16057,29 @@ enforce_retval: return 0; } +static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->changes_pkt_data = true; +} + +/* 't' is an index of a call-site. + * 'w' is a callee entry point. + * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. + * Rely on DFS traversal order and absence of recursive calls to guarantee that + * callee's change_pkt_data marks would be correct at that moment. + */ +static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) +{ + struct bpf_subprog_info *caller, *callee; + + caller = find_containing_subprog(env, t); + callee = find_containing_subprog(env, w); + caller->changes_pkt_data |= callee->changes_pkt_data; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -16134,6 +16213,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, bool visit_callee) { int ret, insn_sz; + int w; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); @@ -16145,8 +16225,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, mark_jmp_point(env, t + insn_sz); if (visit_callee) { + w = t + insns[t].imm + 1; mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + merge_callee_effects(env, t, w); + ret = push_insn(t, w, BRANCH, env); } return ret; } @@ -16466,6 +16548,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } + if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -16600,6 +16684,7 @@ walk_cfg: } } ret = 0; /* cfg looks good */ + env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; err_free: kvfree(insn_state); @@ -17216,12 +17301,16 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { + struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; sl = *explored_state(env, insn); while (sl) { if (sl->state.branches) goto next; + loop_entry = get_loop_entry(&sl->state); + if (loop_entry && loop_entry->branches) + goto next; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) goto next; @@ -17913,7 +18002,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->jmp_history_cnt > 40; + cur->insn_hist_end - cur->insn_hist_start > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -18111,7 +18200,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = err ? : push_insn_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -18210,8 +18299,8 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -18379,7 +18468,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0, 0); + err = push_insn_history(env, state, 0, 0); if (err) return err; } @@ -18650,6 +18739,10 @@ process_bpf_exit: return err; break; } else { + if (WARN_ON_ONCE(env->cur_state->loop_entry)) { + verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n"); + return -EFAULT; + } do_print_state = true; continue; } @@ -19717,7 +19810,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; @@ -19746,6 +19839,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; } } @@ -19766,6 +19863,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; } } @@ -20102,6 +20203,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -21938,6 +22040,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + bool tgt_changes_pkt_data; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -21972,6 +22075,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension programs should be JITed\n"); return -EINVAL; } + tgt_changes_pkt_data = aux->func + ? aux->func[subprog]->aux->changes_pkt_data + : aux->changes_pkt_data; + if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) { + bpf_log(log, + "Extension program changes packet data, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22228,6 +22339,33 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. + */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, kthread_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22316,6 +22454,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -22437,10 +22580,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); if (ret < 0) goto skip_full_check; @@ -22455,6 +22594,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -22591,6 +22734,7 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); + kvfree(env->insn_hist); err_free_env: kvfree(env); return ret; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4378f3eff25d..5fc2801ee921 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,7 +90,7 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif @@ -123,8 +123,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -2339,9 +2362,37 @@ static struct file_system_type cgroup2_fs_type = { }; #ifdef CONFIG_CPUSETS_V1 +enum cpuset_param { + Opt_cpuset_v2_mode, +}; + +static const struct fs_parameter_spec cpuset_fs_parameters[] = { + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + {} +}; + +static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, cpuset_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + return 0; + } + return -EINVAL; +} + static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, + .parse_param = cpuset_parse_param, }; /* @@ -2378,6 +2429,7 @@ static int cpuset_init_fs_context(struct fs_context *fc) static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, + .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif @@ -5512,7 +5564,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5521,7 +5573,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5656,7 +5708,7 @@ err_list_del: err_free_css: list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5889,7 +5941,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6267,8 +6319,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 976a8bc3ff60..383963e28ac6 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -33,6 +33,7 @@ enum prs_errcode { PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, + PERR_REMOTE, }; /* bits in struct cpuset flags field */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 839f88ba17f7..25f9565f798d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -62,6 +62,7 @@ static const char * const perr_strings[] = { [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; /* @@ -266,7 +267,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); @@ -1099,9 +1100,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). */ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1768,7 +1771,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - } else if (is_partition_invalid(cs) && + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; @@ -2825,6 +2828,19 @@ static int update_prstate(struct cpuset *cs, int new_prs) } /* + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. + */ + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; + goto out; + } + + /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ @@ -3776,9 +3792,10 @@ retry: partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned - * back to a regular one. + * back to a regular one with a non-empty effective xcpus. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..c37888b7d25a 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; - unsigned int state; rcu_read_lock(); - /* Check if the cgroup is still FREEZING, but not FROZEN. The extra - * !FROZEN check is required, because the FREEZING bit is not cleared - * when the state FROZEN is reached. - */ - state = task_freezer(task)->state; - ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 3e01781aeb7b..c4ce2f5a9745 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -323,13 +323,11 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) rcu_read_unlock(); } - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } + /* play nice and avoid disabling interrupts for a long time */ + __cgroup_rstat_unlock(cgrp, cpu); + if (!cond_resched()) + cpu_relax(); + __cgroup_rstat_lock(cgrp, cpu); } } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 055da410ac71..9e5d63efe7c5 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; @@ -481,8 +480,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_err("Reserved memory: unable to setup CMA region\n"); return err; } - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) dma_contiguous_default_area = cma; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f6f0387761d0..39972e834e7a 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -59,8 +60,7 @@ enum map_err_types { * @direction: enum dma_data_direction * @sg_call_ents: 'nents' from dma_map_sg * @sg_mapped_ents: 'mapped_ents' from dma_map_sg - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn + * @paddr: physical start address of the mapping * @map_err_type: track whether dma_mapping_error() was checked * @stack_len: number of backtrace entries in @stack_entries * @stack_entries: stack of backtrace history @@ -74,8 +74,7 @@ struct dma_debug_entry { int direction; int sg_call_ents; int sg_mapped_ents; - unsigned long pfn; - size_t offset; + phys_addr_t paddr; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE unsigned int stack_len; @@ -143,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -389,14 +389,6 @@ static void hash_bucket_del(struct dma_debug_entry *entry) list_del(&entry->list); } -static unsigned long long phys_addr(struct dma_debug_entry *entry) -{ - if (entry->type == dma_debug_resource) - return __pfn_to_phys(entry->pfn) + entry->offset; - - return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; -} - /* * For each mapping (initial cacheline in the case of * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a @@ -428,8 +420,8 @@ static DEFINE_SPINLOCK(radix_lock); static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) { - return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + - (entry->offset >> L1_CACHE_SHIFT); + return ((entry->paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) + + (offset_in_page(entry->paddr) >> L1_CACHE_SHIFT); } static int active_cacheline_read_overlap(phys_addr_t cln) @@ -538,11 +530,11 @@ void debug_dma_dump_mappings(struct device *dev) if (!dev || dev == entry->dev) { cln = to_cacheline_number(entry); dev_info(entry->dev, - "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n", + "%s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n", type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - &cln, dir2name[entry->direction], + &entry->paddr, entry->dev_addr, + entry->size, &cln, + dir2name[entry->direction], maperr2str[entry->map_err_type]); } } @@ -569,13 +561,13 @@ static int dump_show(struct seq_file *seq, void *v) list_for_each_entry(entry, &bucket->list, list) { cln = to_cacheline_number(entry); seq_printf(seq, - "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n", + "%s %s %s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n", dev_driver_string(entry->dev), dev_name(entry->dev), type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - &cln, dir2name[entry->direction], + &entry->paddr, entry->dev_addr, + entry->size, &cln, + dir2name[entry->direction], maperr2str[entry->map_err_type]); } spin_unlock_irqrestore(&bucket->lock, flags); @@ -1003,16 +995,17 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if ((entry->type == dma_debug_coherent) && - (phys_addr(ref) != phys_addr(entry))) { + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && + ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " "[device address=0x%016llx] [size=%llu bytes] " - "[cpu alloc address=0x%016llx] " - "[cpu free address=0x%016llx]", + "[cpu alloc address=0x%pa] " + "[cpu free address=0x%pa]", ref->dev_addr, ref->size, - phys_addr(entry), - phys_addr(ref)); + &entry->paddr, + &ref->paddr); } if (ref->sg_call_ents && ref->type == dma_debug_sg && @@ -1231,8 +1224,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_single; - entry->pfn = page_to_pfn(page); - entry->offset = offset; + entry->paddr = page_to_phys(page) + offset; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1327,8 +1319,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; - entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset; + entry->paddr = sg_phys(s); entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; @@ -1374,8 +1365,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, + .paddr = sg_phys(s), .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = dir, @@ -1392,6 +1382,18 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, } } +static phys_addr_t virt_to_paddr(void *virt) +{ + struct page *page; + + if (is_vmalloc_addr(virt)) + page = vmalloc_to_page(virt); + else + page = virt_to_page(virt); + + return page_to_phys(page) + offset_in_page(virt); +} + void debug_dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t dma_addr, void *virt, unsigned long attrs) @@ -1414,16 +1416,11 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, entry->type = dma_debug_coherent; entry->dev = dev; - entry->offset = offset_in_page(virt); + entry->paddr = virt_to_paddr(virt); entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; - if (is_vmalloc_addr(virt)) - entry->pfn = vmalloc_to_pfn(virt); - else - entry->pfn = page_to_pfn(virt_to_page(virt)); - add_dma_entry(entry, attrs); } @@ -1433,7 +1430,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, - .offset = offset_in_page(virt), .dev_addr = dma_addr, .size = size, .direction = DMA_BIDIRECTIONAL, @@ -1443,10 +1439,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size, if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) return; - if (is_vmalloc_addr(virt)) - ref.pfn = vmalloc_to_pfn(virt); - else - ref.pfn = page_to_pfn(virt_to_page(virt)); + ref.paddr = virt_to_paddr(virt); if (unlikely(dma_debug_disabled())) return; @@ -1469,8 +1462,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, entry->type = dma_debug_resource; entry->dev = dev; - entry->pfn = PHYS_PFN(addr); - entry->offset = offset_in_page(addr); + entry->paddr = addr; entry->size = size; entry->dev_addr = dma_addr; entry->direction = direction; @@ -1547,8 +1539,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, + .paddr = sg_phys(s), .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, @@ -1579,8 +1570,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, + .paddr = sg_phys(sg), .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, @@ -1596,6 +1586,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..48757ca13f31 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 864a1121bf08..c7cc4e33ec6e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -223,6 +223,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO && ents != -EREMOTEIO)) { + trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs); return -EIO; } @@ -604,22 +605,29 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, if (WARN_ON_ONCE(flag & __GFP_COMP)) return NULL; - if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) { + trace_dma_alloc(dev, cpu_addr, *dma_handle, size, + DMA_BIDIRECTIONAL, flag, attrs); return cpu_addr; + } /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_alloc_direct(dev, ops)) + if (dma_alloc_direct(dev, ops)) { cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); - else if (use_dma_iommu(dev)) + } else if (use_dma_iommu(dev)) { cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); - else if (ops->alloc) + } else if (ops->alloc) { cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - else + } else { + trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag, + attrs); return NULL; + } - trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs); + trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, + flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } @@ -641,10 +649,11 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, */ WARN_ON(irqs_disabled()); + trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, + attrs); if (!cpu_addr) return; - trace_dma_free(dev, cpu_addr, dma_handle, size, attrs); debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); @@ -683,9 +692,11 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) { - trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size, - dir, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, + size, dir, gfp, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); + } else { + trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } return page; } @@ -708,8 +719,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page, void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { - trace_dma_unmap_page(dev, dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); @@ -768,8 +779,10 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (sgt) { sgt->nents = 1; - trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); + trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); + } else { + trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs); } return sgt; } @@ -787,7 +800,7 @@ static void free_single_sgt(struct device *dev, size_t size, void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { - trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0); + trace_dma_free_sgt(dev, sgt, size, dir); debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); if (use_dma_iommu(dev)) @@ -897,6 +910,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +static bool __dma_addressing_limited(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops) || use_dma_iommu(dev)) + return false; + return !dma_direct_all_ram_mapped(dev); +} + /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check @@ -907,15 +933,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask); */ bool dma_addressing_limited(struct device *dev) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev)) - return true; - - if (unlikely(ops) || use_dma_iommu(dev)) + if (!__dma_addressing_limited(dev)) return false; - return !dma_direct_all_ram_mapped(dev); + + dev_dbg(dev, "device is DMA addressing limited\n"); + return true; } EXPORT_SYMBOL_GPL(dma_addressing_limited); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 7b04f7575796..ee45dee33d49 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); + pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)), + __builtin_return_address(0)); if (!addr) goto free_page; #else diff --git a/kernel/events/core.c b/kernel/events/core.c index 97af53c43608..3cc06ffb60c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -206,6 +206,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -892,13 +905,19 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -916,7 +935,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1195,6 +1213,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu) WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } +static inline void perf_pmu_read(struct perf_event *event) +{ + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); +} + static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); @@ -2105,8 +2129,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2139,8 +2164,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2198,18 +2222,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2245,8 +2257,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2506,6 +2525,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2520,13 +2548,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } @@ -3482,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -4634,15 +4666,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -6033,6 +6058,9 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -6101,7 +6129,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6589,11 +6617,21 @@ out_put: ring_buffer_put(rb); /* could be last */ } +static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting perf mappings to prevent refcount leaks due to + * the resulting non-matching offsets and sizes. See open()/close(). + */ + return -EINVAL; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, + .may_split = perf_mmap_may_split, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) @@ -6685,9 +6723,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; - goto accounting; } @@ -6789,8 +6825,10 @@ accounting: } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); - if (!ret) + if (!ret) { + atomic_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; + } } unlock: @@ -6800,6 +6838,7 @@ unlock: atomic_inc(&event->mmap_count); } else if (rb) { + /* AUX allocation failed */ atomic_dec(&rb->mmap_count); } aux_unlock: @@ -6807,6 +6846,9 @@ aux_unlock: mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); + if (ret) + return ret; + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. @@ -7096,6 +7138,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -7408,9 +7454,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7423,9 +7468,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7484,6 +7528,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -7806,6 +7853,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; @@ -9716,14 +9766,14 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; } if (event->attr.freq) { @@ -10701,7 +10751,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* @@ -10757,8 +10807,9 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; @@ -10796,6 +10847,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { @@ -10815,7 +10880,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -11978,40 +12051,51 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; - if (ret && event->destroy) - event->destroy(event); + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; + + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; + + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) { - event->pmu = NULL; - module_put(pmu->module); + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } @@ -13661,6 +13745,9 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); @@ -13683,8 +13770,6 @@ inherit_event(struct perf_event *parent_event, return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, @@ -13705,7 +13790,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6c2cb4e4f48d..8f3f624419aa 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index bbfa22c0a159..6ecbbc57cdfd 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..d465b36bcc86 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -742,10 +742,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = EXIT_ZOMBIE; /* - * sub-thread or delay_group_leader(), wake up the - * PIDFD_THREAD waiters. + * Ignore thread-group leaders that exited before all + * subthreads did. */ - if (!thread_group_empty(tsk)) + if (!delay_group_leader(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { @@ -923,6 +923,15 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + exit_mm(); if (group_dead) @@ -939,14 +948,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 12decadff468..97c9afe3efc3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -505,10 +505,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(new->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(new); - return new; } @@ -699,6 +695,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; + + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(tmp->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(tmp); + retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; diff --git a/kernel/freezer.c b/kernel/freezer.c index 8d530d0949ff..6a96149aede9 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) void __thaw_task(struct task_struct *p) { - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (WARN_ON_ONCE(freezing(p))) - goto unlock; - - if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) - goto unlock; - - wake_up_state(p, TASK_FROZEN); -unlock: - spin_unlock_irqrestore(&freezer_lock, flags); + guard(spinlock_irqsave)(&freezer_lock); + if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL)) + wake_up_state(p, TASK_FROZEN); } /** diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index a83fdc053c2b..6d62fc7c1dfa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1146,7 +1146,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 117d9d4d3c3b..5121d9a6dd3b 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test) struct kcsan_scoped_access *reorder_access = NULL; #endif arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - atomic_t dummy; + atomic_t dummy = ATOMIC_INIT(0); KCSAN_TEST_REQUIRES(test, reorder_access != NULL); KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 7c6588148d42..0c746a150e34 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -231,6 +231,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" choice prompt "Hash algorithm to sign modules" depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default MODULE_SIG_SHA512 help This determines which sort of hashing algorithm will be used during signature generation. This algorithm _must_ be built into the kernel diff --git a/kernel/module/main.c b/kernel/module/main.c index 93a07387af3b..4511d0a4762a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -703,14 +703,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, struct module *mod; char name[MODULE_NAME_LEN]; char buf[MODULE_FLAGS_BUF_SIZE]; - int ret, forced = 0; + int ret, len, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; + len = strncpy_from_user(name, name_user, MODULE_NAME_LEN); + if (len == 0 || len == MODULE_NAME_LEN) + return -ENOENT; + if (len < 0) + return len; audit_log_kern_module(name); @@ -2898,7 +2900,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_allocated = true; - audit_log_kern_module(mod->name); + audit_log_kern_module(info->name); /* Reserve our place in the list. */ err = add_unformed_module(mod); @@ -3058,8 +3060,10 @@ static int load_module(struct load_info *info, const char __user *uargs, * failures once the proper module was allocated and * before that. */ - if (!module_allocated) + if (!module_allocated) { + audit_log_kern_module(info->name ? info->name : "?"); mod_stat_bump_becoming(info, flags); + } free_copy(info, flags); return err; } diff --git a/kernel/padata.c b/kernel/padata.c index 22770372bdf3..3e0ef0753e73 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } diff --git a/kernel/panic.c b/kernel/panic.c index fbc59b3b64d0..ddad0578355b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -832,9 +832,15 @@ device_initcall(register_warn_debugfs); */ __visible noinstr void __stack_chk_fail(void) { + unsigned long flags; + instrumentation_begin(); + flags = user_access_save(); + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + + user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/params.c b/kernel/params.c index 2e447f8ae183..9935ff599356 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -763,7 +763,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static struct module_kobject * __init locate_module_kobject(const char *name) +struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -805,7 +805,7 @@ static void __init kernel_add_sysfs_param(const char *name, struct module_kobject *mk; int err; - mk = locate_module_kobject(name); + mk = lookup_or_create_module_kobject(name); if (!mk) return; @@ -876,7 +876,7 @@ static void __init version_sysfs_builtin(void) int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { - mk = locate_module_kobject(vattr->module_name); + mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); @@ -949,7 +949,9 @@ struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } const struct kobj_type module_ktype = { diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..19c48aa5355d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -136,17 +137,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 927cc55ba0b3..1c9fe741fe6d 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. */ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -245,6 +233,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table, unsigned long prev_cost = ULONG_MAX; int i, ret; + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + /* Compute the cost of each performance state. */ for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res, cost; @@ -308,9 +300,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. */ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +319,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -399,7 +392,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -559,6 +552,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -629,7 +623,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->flags |= flags; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -666,7 +662,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -674,9 +671,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -698,7 +695,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; @@ -729,7 +726,7 @@ static void em_adjust_new_capacity(struct device *dev, struct em_perf_domain *pd, u64 max_cap) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; em_table = em_table_dup(pd); if (!em_table) { @@ -820,7 +817,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index d8bad1eeedd3..85008ead2ac9 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -89,6 +89,11 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && diff --git a/kernel/power/main.c b/kernel/power/main.c index 6254814d4817..0622e7dacf17 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -613,7 +613,8 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; + return pm_debug_messages_on && (hibernation_in_progress() || + pm_suspend_target_state != PM_SUSPEND_ON); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); diff --git a/kernel/power/power.h b/kernel/power/power.h index de0e6b1077f2..6d1ec7b23e84 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -71,10 +71,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 52571dcad768..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) len += sysfs_emit_at(buf, len, "%s ", wl->name); } + if (len > 0) + --len; + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..e7a3af81b173 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) /** * nbcon_context_try_acquire_direct - Try to acquire directly - * @ctxt: The context of the caller - * @cur: The current console state + * @ctxt: The context of the caller + * @cur: The current console state + * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. @@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) * * Errors: * - * -EPERM: A panic is in progress and this is not the panic CPU. - * Or the current owner or waiter has the same or higher - * priority. No acquire method can be successful in - * this case. + * -EPERM: A panic is in progress and this is neither the panic + * CPU nor is this a reacquire. Or the current owner or + * waiter has the same or higher priority. No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * If this acquire is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the other_cpu_in_panic() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (other_cpu_in_panic()) + return -EPERM; + /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) nbcon_state_read(con, &cur); try_again: - err = nbcon_context_try_acquire_direct(ctxt, &cur); + err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; @@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); - while (!nbcon_context_try_acquire(ctxt)) + while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); @@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) cant_migrate(); } - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* @@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { @@ -1762,7 +1781,7 @@ bool nbcon_device_try_acquire(struct console *con) ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 881a26e18c65..3a91b739e8f3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3310,7 +3310,12 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } @@ -3347,7 +3352,12 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..f87c9d6d36fc 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..36b78d5a0675 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -81,7 +81,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, // Number of typesafe_lookup structures, that is, the degree of concurrency. torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -1029,7 +1029,7 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } @@ -1126,12 +1126,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e52c1dd0628..7b073b8b5e91 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -802,6 +802,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp) return 0; } +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif + /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since @@ -937,9 +941,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp) rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); - rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); - rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); - rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); rsrp->jiffies = jiffies; rsrp->gp_seq = rdp->gp_seq; } @@ -1822,10 +1826,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -3068,6 +3076,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. @@ -4183,14 +4195,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); @@ -5012,6 +5027,8 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcu_preempt_deferred_qs_init(rdp); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a9a811d9d7a3..8ba04b179416 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -168,12 +168,23 @@ struct rcu_snap_record { u64 cputime_irq; /* Accumulated cputime of hard irqs */ u64 cputime_softirq;/* Accumulated cputime of soft irqs */ u64 cputime_system; /* Accumulated cputime of kernel tasks */ - unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ unsigned int nr_softirqs; /* Accumulated number of soft irqs */ unsigned long long nr_csw; /* Accumulated number of task switches */ unsigned long jiffies; /* Track jiffies value */ }; +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention. + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -191,7 +202,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ @@ -476,6 +487,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..a1a5942d7017 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * callback storms, no need to wake up too early. */ if (waketype == RCU_NOCB_WAKE_LAZY && - rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { @@ -1152,7 +1152,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(cpu_online(rdp->cpu)); /* @@ -1162,7 +1161,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); @@ -1172,7 +1171,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_nocb_rdp_offload_wait_cond(rdp)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c7cbd145d5e..2d865b2096be 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -485,13 +485,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -623,10 +626,29 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) */ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { + unsigned long flags; struct rcu_data *rdp; rdp = container_of(iwp, struct rcu_data, defer_qs_iw); - rdp->defer_qs_iw_pending = false; + local_irq_save(flags); + + /* + * If the IRQ work handler happens to run in the middle of RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. + */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); + + local_irq_restore(flags); } /* @@ -672,17 +694,11 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } @@ -821,6 +837,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -832,8 +852,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -974,13 +1003,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs @@ -1008,6 +1040,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4432db6d0b99..4d524a2212a8 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -457,8 +457,8 @@ static void print_cpu_stat_info(int cpu) rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); pr_err("\t hardirqs softirqs csw/system\n"); - pr_err("\t number: %8ld %10d %12lld\n", - kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, nr_context_switches_cpu(cpu) - rsrp->nr_csw); pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", diff --git a/kernel/resource.c b/kernel/resource.c index 4101016e8b20..1d48ae864635 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1268,8 +1268,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent * become unavailable to other users. Conflicts are * not expected. Warn to aid debugging if encountered. */ - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_warn("Unaddressable device %s %pR conflicts with %pR", + if (parent == &iomem_resource && + conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR\n", conflict->name, conflict, res); } if (conflict != parent) { diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..23894ba8250c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -149,6 +149,29 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) return 0; } +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. + */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + + return 0; +} + +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -157,17 +180,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -243,7 +265,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -254,9 +276,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -288,11 +310,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -366,6 +388,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -420,6 +443,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. + */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9bb1b4c5842..4b1953b6c76a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1303,7 +1303,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.h_nr_running > 1) + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -2229,6 +2229,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat. */ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); @@ -3885,6 +3891,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (task_on_scx(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -5970,7 +5981,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) @@ -6517,12 +6528,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Otherwise marks the task's __state as RUNNING */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long task_state) + unsigned long *task_state_p) { + unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; return false; } @@ -6656,7 +6669,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, &prev_state); switch_count = &prev->nvcsw; } @@ -8415,7 +8428,7 @@ void __init sched_init(void) init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - root_task_group.scx_weight = CGROUP_WEIGHT_DFL; + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -8855,7 +8868,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; - scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + scx_tg_init(tg); alloc_uclamp_sched_group(tg, parent); return tg; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e51d5ce730be..e5ced97d9681 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,20 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + return true; } @@ -95,10 +106,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -365,7 +388,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + WRITE_ONCE(sg_cpu->sg_policy->limits_changed, true); } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -888,7 +911,16 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } struct cpufreq_governor schedutil_gov = { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e7ae404c8d2..53e3670fbb1e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1485,7 +1485,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; @@ -1592,7 +1594,7 @@ throttle: */ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) { - s64 delta_exec, scaled_delta_exec; + s64 delta_exec; if (!rq->fair_server.dl_defer) return; @@ -1605,9 +1607,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) if (delta_exec < 0) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; + rq->fair_server.runtime -= delta_exec; if (rq->fair_server.runtime < 0) { rq->fair_server.dl_defer_running = 0; @@ -3227,6 +3227,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3242,7 +3245,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1e3bc0774efd..9815f9a0cd59 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -378,7 +378,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_running) { + if (rq->cfs.h_nr_queued) { update_rq_clock(rq); dl_server_stop(&rq->fair_server); } @@ -391,7 +391,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_running) + if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); } @@ -843,7 +843,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fcf968490308..563a7dc2ece6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3918,6 +3918,11 @@ static void scx_cgroup_warn_missing_idle(struct task_group *tg) cgroup_warned_missing_idle = true; } +void scx_tg_init(struct task_group *tg) +{ + tg->scx_weight = CGROUP_WEIGHT_DFL; +} + int scx_tg_online(struct task_group *tg) { int ret = 0; @@ -4053,12 +4058,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + if (scx_cgroup_enabled && SCX_HAS_OP(cgroup_set_weight) && + tg->scx_weight != weight) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + tg_cgrp(tg), weight); + + tg->scx_weight = weight; percpu_up_read(&scx_cgroup_rwsem); } @@ -4530,7 +4535,7 @@ unlock: static void free_exit_info(struct scx_exit_info *ei) { - kfree(ei->dump); + kvfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -4546,7 +4551,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); - ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); @@ -5215,6 +5220,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) for_each_possible_cpu(cpu) cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable(&scx_builtin_idle_enabled); + } else { + static_branch_disable(&scx_builtin_idle_enabled); + } + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -5282,13 +5294,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } - /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. @@ -5367,6 +5372,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (!tryget_task_struct(p)) + continue; + if (old_class != new_class && p->se.sched_delayed) dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -5379,6 +5387,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); + put_task_struct(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -6731,6 +6740,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. + */ + kit->dsq = NULL; + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1079b56b0f7a..67032c30c754 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -70,6 +70,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_EXT_GROUP_SCHED +void scx_tg_init(struct task_group *tg); int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); @@ -79,6 +80,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); #else /* CONFIG_EXT_GROUP_SCHED */ +static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ceb023629d48..af61769b1d50 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -71,10 +71,10 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -2147,7 +2147,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_queued; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -5427,7 +5427,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_queued of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5511,6 +5511,7 @@ static void set_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_runnable--; cfs_rq->h_nr_delayed++; if (cfs_rq_throttled(cfs_rq)) break; @@ -5533,6 +5534,7 @@ static void clear_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_runnable++; cfs_rq->h_nr_delayed--; if (cfs_rq_throttled(cfs_rq)) break; @@ -5583,7 +5585,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_queued of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5985,8 +5987,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta, dequeue = 1; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_task_delta, delayed_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -6016,7 +6018,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -6038,9 +6041,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; @@ -6061,18 +6065,19 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); done: /* @@ -6091,8 +6096,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_task_delta, delayed_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6125,7 +6130,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -6141,9 +6147,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -6159,9 +6166,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -6171,11 +6179,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_running && rq->cfs.h_nr_running) + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); @@ -6890,7 +6898,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; @@ -7033,7 +7041,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); int h_nr_delayed = 0; int task_new = !(flags & ENQUEUE_WAKEUP); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; /* @@ -7081,7 +7089,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; + if (!h_nr_delayed) + cfs_rq->h_nr_runnable++; + cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7107,7 +7117,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; + if (!h_nr_delayed) + cfs_rq->h_nr_runnable++; + cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7119,7 +7131,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto enqueue_throttle; } - if (!rq_h_nr_running && rq->cfs.h_nr_running) { + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running) dl_server_update_idle_time(rq, rq->curr); @@ -7166,25 +7178,22 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; int idle_h_nr_running = 0; - int h_nr_running = 0; + int h_nr_queued = 0; int h_nr_delayed = 0; struct cfs_rq *cfs_rq; u64 slice = 0; if (entity_is_task(se)) { p = task_of(se); - h_nr_running = 1; + h_nr_queued = 1; idle_h_nr_running = task_has_idle_policy(p); if (!task_sleep && !task_delayed) h_nr_delayed = !!se->sched_delayed; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) { @@ -7194,15 +7203,18 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; } - cfs_rq->h_nr_running -= h_nr_running; + if (!h_nr_delayed) + cfs_rq->h_nr_runnable -= h_nr_queued; + cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + idle_h_nr_running = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7238,21 +7250,23 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running -= h_nr_running; + if (!h_nr_delayed) + cfs_rq->h_nr_runnable -= h_nr_queued; + cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + idle_h_nr_running = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) return 0; } - sub_nr_running(rq, h_nr_running); + sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ @@ -7299,6 +7313,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) return true; } +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); +} + #ifdef CONFIG_SMP /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ @@ -7460,8 +7479,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -10396,7 +10419,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_queued != 1) return false; return check_cpu_capacity(rq, sd); @@ -10431,7 +10454,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_queued; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; @@ -10746,7 +10769,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11528,7 +11551,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_queued; if (!nr_running) continue; @@ -11687,7 +11710,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. */ if (env->idle && - (env->src_rq->cfs.h_nr_running == 1)) { + (env->src_rq->cfs.h_nr_queued == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -12197,8 +12220,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. + * + * sched_balance_newidle() bumps the cost whenever newidle + * balance fails, and we don't want things to grow out of + * control. Use the sysctl_sched_migration_cost as the upper + * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = cost; + sd->max_newidle_lb_cost = + min(cost, sysctl_sched_migration_cost + 200); sd->last_decay_max_lb_cost = jiffies; } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* @@ -12430,7 +12459,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12903,10 +12932,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Failing newidle means it is not effective; + * bump the cost so we end up doing less of it. + */ + if (!pulled_task) + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; + + update_newidle_cost(sd, domain_cost); } /* @@ -12928,11 +12964,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -13619,7 +13655,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) parent_cfs_rq->idle_nr_running--; } - idle_task_delta = grp_cfs_rq->h_nr_running - + idle_task_delta = grp_cfs_rq->h_nr_queued - grp_cfs_rq->idle_h_nr_running; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..52ca8e268cfc 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -80,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (int)this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 171a802420a1..8189a35e53fe 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_queued * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 84dad1511d1e..7d0f8fdd48a3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -172,17 +172,35 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - group->enabled = true; - for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -262,14 +280,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -768,31 +786,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. - * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in * the state mask. Clear, set, or carry the current state if @@ -843,7 +851,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -864,8 +871,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -900,24 +905,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -926,16 +936,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -968,12 +977,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. If there are @@ -983,20 +991,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled)) return; @@ -1005,8 +1014,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1015,25 +1023,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } #endif @@ -1221,12 +1226,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 172c588de542..6ad6717084ed 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2951,6 +2951,12 @@ undo: } mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d79de755c1c2..a441990fe808 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -651,7 +651,8 @@ struct balance_callback { struct cfs_rq { struct load_weight load; unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ unsigned int h_nr_delayed; @@ -907,7 +908,7 @@ static inline void se_update_runnable(struct sched_entity *se) if (!entity_is_task(se)) { struct cfs_rq *cfs_rq = se->my_q; - se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; + se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed; } } @@ -1155,7 +1156,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned int nr_uninterruptible; + unsigned long nr_uninterruptible; struct task_struct __rcu *curr; struct sched_dl_entity *dl_server; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9748a4c8d668..4bd825c24e26 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2174,6 +2174,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) goto unlock; hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); + if (!hop_masks) + goto unlock; hop = hop_masks - k.masks; ret = hop ? diff --git a/kernel/signal.c b/kernel/signal.c index 2ae45e6eb6bb..468b589c39e6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2063,8 +2063,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the - * non-PIDFD_THREAD waiters. + * Notify for thread-group leaders without subthreads. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8c4524ce65fa..00ff17635041 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index da821ce258ea..d758e66ad59e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work, - struct wake_q_head *wakeq) + struct cpu_stop_work *work) { list_add_tail(&work->list, &stopper->works); - wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; @@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work, &wakeq); + __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); - wake_up_q(&wakeq); + if (enabled) + wake_up_process(stopper->thread); preempt_enable(); return enabled; @@ -263,7 +261,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); - DEFINE_WAKE_Q(wakeq); int err; retry: @@ -299,8 +296,8 @@ retry: } err = 0; - __cpu_stop_queue_work(stopper1, work1, &wakeq); - __cpu_stop_queue_work(stopper2, work2, &wakeq); + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); unlock: raw_spin_unlock(&stopper2->lock); @@ -315,7 +312,10 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + wake_up_process(stopper1->thread); + wake_up_process(stopper2->thread); + } preempt_enable(); return err; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 58fb7280cabb..ae862ad9642c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -302,7 +302,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. */ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index db9c06bb2311..cd9cb7ccb540 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -808,10 +798,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -1597,14 +1587,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, @@ -2291,11 +2286,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 6bcee4704059..d44641108ba8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1401,6 +1401,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4576aaed13b2..c5d9bfbb082b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -118,6 +118,7 @@ static int posix_timer_add(struct k_itimer *timer) return id; } spin_unlock(&hash_lock); + cond_resched(); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; @@ -513,14 +514,21 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* This makes the timer valid in the hash table */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..9a3859443c04 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -509,6 +509,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +529,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +566,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. + */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 96933082431f..e3896b2be453 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1218,7 +1218,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..cfbb46cc4e76 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 55f279ddfd63..3ec7df7dbeec 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -416,10 +416,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -462,7 +463,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -1827,7 +1828,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -2931,6 +2932,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; @@ -3345,7 +3349,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c12335499ec9..988a4c4ba97b 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1316,6 +1316,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90b59c627bb8..370cde32c696 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4569,13 +4569,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); } + } else { + if (hash) + iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash); + else + iter->hash = EMPTY_HASH; + } - if (!iter->hash) { - trace_parser_put(&iter->parser); - goto out_unlock; - } - } else - iter->hash = hash; + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } ret = 0; @@ -5944,9 +5948,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; @@ -6444,9 +6449,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); - } else { - /* For read only, the hash is the ops hash */ - iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -7294,9 +7296,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7305,6 +7308,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e1ffbed8cc5e..95641a46db4c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1832,10 +1832,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { @@ -2794,6 +2796,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2838,7 +2846,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2878,7 +2885,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2906,8 +2912,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2926,7 +2930,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -5810,24 +5813,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. - * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5853,51 +5852,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. - */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -6752,7 +6712,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6857,7 +6817,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6866,7 +6825,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -7272,8 +7230,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7294,13 +7252,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ffe1422ab03f..91e6bf1b101a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -787,7 +787,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -824,6 +827,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { @@ -1754,7 +1758,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; @@ -1768,7 +1772,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1778,8 +1782,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1789,11 +1792,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; - goto out; + goto fail; } + ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } @@ -1809,13 +1813,13 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; - goto out; + goto fail; } *ppos += read; - ret = read; - -out: + return read; +fail: + trace_parser_fail(parser); return ret; } @@ -2318,10 +2322,10 @@ int __init register_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ @@ -2333,8 +2337,7 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) @@ -3343,10 +3346,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; @@ -3399,7 +3401,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3429,7 +3430,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3474,7 +3474,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3490,7 +3489,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); @@ -4653,21 +4651,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } @@ -6703,13 +6695,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -6960,7 +6953,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7031,7 +7024,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); @@ -8573,12 +8566,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) - goto out; + return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); - out: + return ret < 0 ? ret : 0; } @@ -10141,10 +10134,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) @@ -10479,7 +10472,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -10592,7 +10585,6 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 04ea327198ba..9b2ae7652cbc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -818,13 +818,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1228,6 +1230,7 @@ bool ftrace_event_is_function(struct trace_event_call *call); */ struct trace_parser { bool cont; + bool fail; char *buffer; unsigned idx; unsigned size; @@ -1235,7 +1238,7 @@ struct trace_parser { static inline bool trace_parser_loaded(struct trace_parser *parser) { - return (parser->idx != 0); + return !parser->fail && parser->idx != 0; } static inline bool trace_parser_cont(struct trace_parser *parser) @@ -1249,6 +1252,11 @@ static inline void trace_parser_clear(struct trace_parser *parser) parser->idx = 0; } +static inline void trace_parser_fail(struct trace_parser *parser) +{ + parser->fail = true; +} + extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, @@ -1729,6 +1737,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1755,11 +1766,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, @@ -2145,7 +2151,7 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..c9b0533407ed 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -125,6 +125,20 @@ out: return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 11dea25ef880..dbea76058863 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -2872,7 +2879,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -3464,6 +3474,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c611b281a5b..f50c2ad43f3d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4ebafc655223..3379e14d38e9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5249,17 +5249,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a tigger happens at each of + * these contexts and interrupts a currently active trigger. + */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5267,12 +5344,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5296,26 +5379,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; + key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } hist_poll_wakeup(); + + out: + put_hist_pad(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -6160,6 +6248,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6204,6 +6295,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } static struct event_trigger_ops event_hist_trigger_ops = { @@ -6219,9 +6311,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data) save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); - - return 0; + return event_hist_trigger_init(data->named_data); } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6708,7 +6798,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a5e3d6acf1e1..d5dbda9b0e4b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -825,7 +825,7 @@ out: } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string @@ -836,14 +836,14 @@ out: * trigger_ops to assign to the event_trigger_data. @private_data can * also be passed in and associated with the event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. * * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -1010,13 +1010,13 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); + trigger_data_free(trigger_data); ret = 0; goto out; } @@ -1043,7 +1043,7 @@ event_trigger_parse(struct event_command *cmd_ops, out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); + trigger_data_free(trigger_data); goto out; } @@ -1581,7 +1581,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } @@ -1814,7 +1814,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); goto out; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fbbc3c719d2f..fbb6cf8dc047 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -574,11 +574,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 59857a1ee44c..628c25693cef 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 935a886af40c..6b9c3f3f870f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1090,7 +1090,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a94790f5cda7..216247913980 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -665,8 +665,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit_nostack(buffer, event); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6e7090e8bf30..1ecc28a3946a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -950,11 +950,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -964,7 +965,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 578919962e5d..055f5164bd96 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; @@ -648,7 +657,7 @@ static int parse_btf_arg(char *varname, ret = query_btf_context(ctx); if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); - return PTR_ERR(params); + return -ENOENT; } } params = ctx->params; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b085a8a164ea..9916677acf24 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -739,7 +739,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/ucount.c b/kernel/ucount.c index 696406939be5..78f4c4255358 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -212,7 +212,7 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { long c, old; c = atomic_long_read(v); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2ef2e1b80091..2f844c279a3e 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4dc72540c3b0..8fbb4385e814 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; @@ -863,12 +864,20 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); + /* + * To prevent watchdog_timer_fn from using the old interval and + * the new watchdog_thresh at the same time, which could lead to + * false softlockup reports, it is necessary to update the + * watchdog_thresh after the softlockup is completed. + */ + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) @@ -881,7 +890,7 @@ static void __lockup_detector_reconfigure(void) void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } @@ -901,27 +910,29 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -939,11 +950,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ -static void proc_watchdog_update(void) +static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(thresh_changed); } /* @@ -976,7 +987,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) - proc_watchdog_update(); + proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; @@ -1027,11 +1038,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); - old = READ_ONCE(watchdog_thresh); + watchdog_thresh_next = READ_ONCE(watchdog_thresh); + + old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!err && write && old != READ_ONCE(watchdog_thresh)) - proc_watchdog_update(); + if (!err && write && old != READ_ONCE(watchdog_thresh_next)) + proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; @@ -1052,7 +1065,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_update(); + proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; @@ -1072,7 +1085,7 @@ static struct ctl_table watchdog_sysctls[] = { }, { .procname = "watchdog_thresh", - .data = &watchdog_thresh, + .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a9d64e08dffc..3c87eb98609c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7731,7 +7731,8 @@ void __init workqueue_init_early(void) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); - + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); |