diff options
Diffstat (limited to 'kernel')
77 files changed, 4208 insertions, 1576 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..ce1435cb08b1 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -30,7 +30,7 @@ choice 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. + or multimedia, select 300Hz instead. config HZ_300 bool "300 HZ" diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 410028633621..70502f038b92 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 095a9554e1de..0d56cea71602 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -577,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt __bpf_kfunc_end_defs(); 
BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 54ff2a85d4c0..148da8f7ff36 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -161,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -169,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - if (!bpf_cgrp_storage_trylock()) - return (unsigned long)NULL; + nobusy = bpf_cgrp_storage_trylock(); - sdata = cgroup_storage_lookup(cgroup, map, true); + sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: - bpf_cgrp_storage_unlock(); + if (nobusy) + bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 106735145948..380e9a7cac75 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -335,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, tinfo->btf_id = prog->aux->attach_btf_id; } -bool bpf_iter_prog_supported(struct bpf_prog *prog) +int bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; @@ -344,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) - return false; + return -EINVAL; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { @@ -360,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); - if (tinfo) { - prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; - prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; - } + if (!tinfo) + return -EINVAL; - return tinfo != NULL; + return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info, + tinfo->reg_info->ctx_arg_info_size); } const struct bpf_func_proto * diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 967492b65185..0a59df1c550a 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -316,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr) BTF_ID(func, bpf_lsm_inode_mknod) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_post_removexattr) BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_removexattr) BTF_ID(func, bpf_lsm_inode_rename) BTF_ID(func, bpf_lsm_inode_rmdir) BTF_ID(func, bpf_lsm_inode_setattr) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 040fb1cd840b..db13ee70d94d 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ 
-146,39 +146,7 @@ void bpf_struct_ops_image_free(void *image) } #define MAYBE_NULL_SUFFIX "__nullable" -#define MAX_STUB_NAME 128 - -/* Return the type info of a stub function, if it exists. - * - * The name of a stub function is made up of the name of the struct_ops and - * the name of the function pointer member, separated by "__". For example, - * if the struct_ops type is named "foo_ops" and the function pointer - * member is named "bar", the stub function name would be "foo_ops__bar". - */ -static const struct btf_type * -find_stub_func_proto(const struct btf *btf, const char *st_op_name, - const char *member_name) -{ - char stub_func_name[MAX_STUB_NAME]; - const struct btf_type *func_type; - s32 btf_id; - int cp; - - cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", - st_op_name, member_name); - if (cp >= MAX_STUB_NAME) { - pr_warn("Stub function name too long\n"); - return NULL; - } - btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); - if (btf_id < 0) - return NULL; - func_type = btf_type_by_id(btf, btf_id); - if (!func_type) - return NULL; - - return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ -} +#define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. 
@@ -203,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name, static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, - const struct btf_type *func_proto, + const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; + bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; + char ksym[KSYM_SYMBOL_LEN]; + const char *stub_fname; + const char *suffix; + s32 stub_func_id; u32 arg_btf_id; int offset; - stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); - if (!stub_func_proto) - return 0; + stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); + if (!stub_fname) { + pr_warn("Cannot find the stub function name for the %s in struct %s\n", + member_name, st_ops_name); + return -ENOENT; + } + + stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); + if (stub_func_id < 0) { + pr_warn("Cannot find the stub function %s in btf\n", stub_fname); + return -ENOENT; + } + + stub_func_proto = btf_type_by_id(btf, stub_func_id); + stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. 
*/ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { - pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", - st_ops_name, member_name, member_name, st_ops_name); + pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", + stub_fname, member_name, st_ops_name); return -EINVAL; } @@ -241,10 +226,18 @@ static int prepare_arg_info(struct btf *btf, info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with - * "__nullable". + * "__nullable or __ref". */ - if (!btf_param_match_suffix(btf, &stub_args[arg_no], - MAYBE_NULL_SUFFIX)) + is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX); + is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], + REFCOUNTED_SUFFIX); + + if (is_nullable) + suffix = MAYBE_NULL_SUFFIX; + else if (is_refcounted) + suffix = REFCOUNTED_SUFFIX; + else continue; /* Should be a pointer to struct */ @@ -253,30 +246,34 @@ static int prepare_arg_info(struct btf *btf, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { - pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", - st_ops_name, member_name, MAYBE_NULL_SUFFIX); + pr_warn("stub function %s has %s tagging to an unsupported type\n", + stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { - pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", - st_ops_name, member_name, arg_no); + pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", + stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { - pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", - arg_no, st_ops_name, member_name); + pr_warn("arg#%u type in stub 
function %s does not match with its original func_proto\n", + arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ - info->reg_type = - PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; + if (is_nullable) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + } else if (is_refcounted) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; + info->refcounted = true; + } info++; info_cnt++; @@ -324,6 +321,13 @@ static bool is_module_member(const struct btf *btf, u32 id) return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -386,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { - const struct btf_type *func_proto; + const struct btf_type *func_proto, *ret_type; + void **stub_func_addr; + u32 moff; + moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", @@ -413,9 +420,23 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (!func_proto) + + /* The member is not a function pointer or + * the function pointer is not supported. 
+ */ + if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; + if (func_proto->type) { + ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); + if (ret_type && !__btf_type_is_struct(ret_type)) { + pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", + mname, st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + } + if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { @@ -425,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, - func_proto, + func_proto, stub_func_addr, arg_info + i); if (err) goto errout; @@ -1152,13 +1174,6 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } -int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) -{ - void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); - - return func_ptr ? 
0 : -ENOTSUPP; -} - static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eacb701bc2be..16ba36f34dfa 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -606,6 +606,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) spin_unlock_bh(&btf_idr_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) @@ -2575,7 +2576,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { + if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } @@ -3332,6 +3333,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; + const char *tag_value; + bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3341,19 +3344,20 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); - - if (!btf_type_is_type_tag(t)) + is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); + if (!is_type_tag) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) + tag_value = __btf_name_by_offset(btf, t->name_off); + if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", tag_value)) type = BPF_KPTR_REF; - else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; - else if 
(!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; @@ -3477,6 +3481,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ goto end; } } + if (field_mask & BPF_RES_SPIN_LOCK) { + if (!strcmp(name, "bpf_res_spin_lock")) { + if (*seen_mask & BPF_RES_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_RES_SPIN_LOCK; + type = BPF_RES_SPIN_LOCK; + goto end; + } + } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if (*seen_mask & BPF_TIMER) @@ -3655,6 +3668,7 @@ static int btf_find_field_one(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: @@ -3948,6 +3962,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; + rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; @@ -3975,6 +3990,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; + case BPF_RES_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->res_spin_lock_off = rec->fields[i].offset; + break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -4018,9 +4038,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { + ret = -EINVAL; + goto end; + } + /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { + btf_record_has_field(rec, BPF_RB_ROOT)) && + (rec->spin_lock_off < 0 && 
rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } @@ -4944,11 +4970,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); @@ -5638,7 +5659,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ @@ -6507,6 +6528,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, + { "rxrpc_tq", 0x10 }, + { "rxrpc_client", 0x1 }, /* skb */ {"kfree_skb", 0x1000}, /* sunrpc */ @@ -6529,6 +6552,103 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, + /* amdgpu */ + { "amdgpu_vm_bo_map", 0x1 }, + { "amdgpu_vm_bo_unmap", 0x1 }, + /* netfs */ + { "netfs_folioq", 0x1 }, + /* xfs from xfs_defer_pending_class */ + { "xfs_defer_create_intent", 0x1 }, + { "xfs_defer_cancel_list", 0x1 }, + { "xfs_defer_pending_finish", 0x1 }, + { "xfs_defer_pending_abort", 0x1 }, + { "xfs_defer_relog_intent", 0x1 }, + { "xfs_defer_isolate_paused", 0x1 }, + { "xfs_defer_item_pause", 0x1 }, + { "xfs_defer_item_unpause", 0x1 }, + /* xfs from xfs_defer_pending_item_class */ + { "xfs_defer_add_item", 0x1 }, + { "xfs_defer_cancel_item", 0x1 }, + { "xfs_defer_finish_item", 0x1 }, + /* xfs from xfs_icwalk_class */ + { "xfs_ioc_free_eofblocks", 0x10 }, + { "xfs_blockgc_free_space", 
0x10 }, + /* xfs from xfs_btree_cur_class */ + { "xfs_btree_updkeys", 0x100 }, + { "xfs_btree_overlapped_query_range", 0x100 }, + /* xfs from xfs_imap_class*/ + { "xfs_map_blocks_found", 0x10000 }, + { "xfs_map_blocks_alloc", 0x10000 }, + { "xfs_iomap_alloc", 0x1000 }, + { "xfs_iomap_found", 0x1000 }, + /* xfs from xfs_fs_class */ + { "xfs_inodegc_flush", 0x1 }, + { "xfs_inodegc_push", 0x1 }, + { "xfs_inodegc_start", 0x1 }, + { "xfs_inodegc_stop", 0x1 }, + { "xfs_inodegc_queue", 0x1 }, + { "xfs_inodegc_throttle", 0x1 }, + { "xfs_fs_sync_fs", 0x1 }, + { "xfs_blockgc_start", 0x1 }, + { "xfs_blockgc_stop", 0x1 }, + { "xfs_blockgc_worker", 0x1 }, + { "xfs_blockgc_flush_all", 0x1 }, + /* xfs_scrub */ + { "xchk_nlinks_live_update", 0x10 }, + /* xfs_scrub from xchk_metapath_class */ + { "xchk_metapath_lookup", 0x100 }, + /* nfsd */ + { "nfsd_dirent", 0x1 }, + { "nfsd_file_acquire", 0x1001 }, + { "nfsd_file_insert_err", 0x1 }, + { "nfsd_file_cons_err", 0x1 }, + /* nfs4 */ + { "nfs4_setup_sequence", 0x1 }, + { "pnfs_update_layout", 0x10000 }, + { "nfs4_inode_callback_event", 0x200 }, + { "nfs4_inode_stateid_callback_event", 0x200 }, + /* nfs from pnfs_layout_event */ + { "pnfs_mds_fallback_pg_init_read", 0x10000 }, + { "pnfs_mds_fallback_pg_init_write", 0x10000 }, + { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, + { "pnfs_mds_fallback_read_done", 0x10000 }, + { "pnfs_mds_fallback_write_done", 0x10000 }, + { "pnfs_mds_fallback_read_pagelist", 0x10000 }, + { "pnfs_mds_fallback_write_pagelist", 0x10000 }, + /* coda */ + { "coda_dec_pic_run", 0x10 }, + { "coda_dec_pic_done", 0x10 }, + /* cfg80211 */ + { "cfg80211_scan_done", 0x11 }, + { "rdev_set_coalesce", 0x10 }, + { "cfg80211_report_wowlan_wakeup", 0x100 }, + { "cfg80211_inform_bss_frame", 0x100 }, + { "cfg80211_michael_mic_failure", 0x10000 }, + /* cfg80211 from wiphy_work_event */ + { "wiphy_work_queue", 0x10 }, + { "wiphy_work_run", 0x10 }, + { "wiphy_work_cancel", 0x10 }, + { "wiphy_work_flush", 0x10 }, + /* 
hugetlbfs */ + { "hugetlbfs_alloc_inode", 0x10 }, + /* spufs */ + { "spufs_context", 0x10 }, + /* kvm_hv */ + { "kvm_page_fault_enter", 0x100 }, + /* dpu */ + { "dpu_crtc_setup_mixer", 0x100 }, + /* binder */ + { "binder_transaction", 0x100 }, + /* bcachefs */ + { "btree_path_free", 0x100 }, + /* hfi1_tx */ + { "hfi1_sdma_progress", 0x1000 }, + /* iptfs */ + { "iptfs_ingress_postq_event", 0x1000 }, + /* neigh */ + { "neigh_update", 0x10 }, + /* snd_firewire_lib */ + { "amdtp_packet", 0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -6679,6 +6799,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; + info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } @@ -6745,7 +6866,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf_id = t->type; t = btf_type_by_id(btf, t->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; @@ -7004,7 +7125,7 @@ error: /* check type tag */ t = btf_type_by_id(btf, mtype->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8..84f58f3d028a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. 
* it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = 
bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da729cbbaeb9..ba6b6118cf50 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. 
*/ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ out: if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ out: SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. 
+ */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: @@ -2290,17 +2345,18 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif -#else +#endif + static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, so warn about it! + * is not working properly, or interpreter is being used when + * prog->jit_requested is not 0, so warn about it! */ WARN_ON_ONCE(1); return 0; } -#endif bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) @@ -2380,8 +2436,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + u32 idx = (round_up(stack_depth, 32) / 32) - 1; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; + /* may_goto may cause stack size > 512, leading to idx out-of-bounds. + * But for non-JITed programs, we don't need bpf_func, so no bounds + * check needed. 
+ */ + if (!fp->jit_requested && + !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + fp->bpf_func = interpreters[idx]; + } else { + fp->bpf_func = __bpf_prog_ret0_warn; + } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif @@ -2906,6 +2972,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) return NULL; } +const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) +{ + return NULL; +} + u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) @@ -3058,6 +3129,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. */ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a48..9876c5fe6c2a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. 
*/ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. + * + * Return: + * * The number of bits set in the mask. 
*/ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. + * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026..20883c6b1546 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -202,7 +202,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 
'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { - verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } @@ -369,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_JMP32 ? 
'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); + (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9eeb7aef85..5a5adc66b8e2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> +#include <asm/rqspinlock.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -78,7 +79,7 @@ */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t raw_lock; + rqspinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -104,8 +105,6 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; - struct lock_class_key lockdep_key; - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab) for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, - &htab->lockdep_key); + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); cond_resched(); } } -static inline int htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long *pflags) +static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) { unsigned long flags; + int ret; - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - - preempt_disable(); - local_irq_save(flags); - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); - return -EBUSY; - } - - raw_spin_lock(&b->raw_lock); + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); + if (ret) 
+ return ret; *pflags = flags; - return 0; } -static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long flags) +static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) { - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock(&b->raw_lock); - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -198,12 +178,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab) static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) @@ -483,14 +463,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; + int err; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); - lockdep_register_key(&htab->lockdep_key); - bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -536,15 +514,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_elem_count; - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, - sizeof(int), - sizeof(int), - GFP_USER); - if (!htab->map_locked[i]) - goto free_map_locked; - } - if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -607,15 +576,12 
@@ free_prealloc: free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); return ERR_PTR(err); } @@ -787,6 +753,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { + if (IS_ERR_OR_NULL(htab->map.record)) + return; + if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; @@ -817,7 +786,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return false; @@ -828,7 +797,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, tgt_l->hash, flags); + htab_unlock_bucket(b, flags); if (l == tgt_l) check_and_free_fields(htab, l); @@ -1147,7 +1116,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1198,7 +1167,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, check_and_free_fields(htab, l_old); } } - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l_old) { if (old_map_ptr) map->ops->map_fd_put_ptr(map, old_map_ptr, true); @@ -1207,7 +1176,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } return 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1254,7 
+1223,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value copy_map_value(&htab->map, l_new->key + round_up(map->key_size, 8), value); - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1275,7 +1244,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (ret) @@ -1312,7 +1281,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1337,7 +1306,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1378,7 +1347,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1402,7 +1371,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); @@ -1444,7 +1413,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1454,7 +1423,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) free_htab_elem(htab, l); @@ -1480,7 +1449,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void 
*key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1491,7 +1460,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) htab_lru_push_free(htab, l); return ret; @@ -1558,7 +1527,6 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1583,9 +1551,6 @@ static void htab_map_free(struct bpf_map *map) bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); } @@ -1628,7 +1593,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &bflags); + ret = htab_lock_bucket(b, &bflags); if (ret) return ret; @@ -1665,7 +1630,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, hlist_nulls_del_rcu(&l->hash_node); out_unlock: - htab_unlock_bucket(htab, b, hash, bflags); + htab_unlock_bucket(b, bflags); if (l) { if (is_lru_map) @@ -1787,7 +1752,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). 
*/ if (locked) { - ret = htab_lock_bucket(htab, b, batch, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); @@ -1810,7 +1775,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1821,7 +1786,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1884,7 +1849,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); locked = false; while (node_to_free) { @@ -2354,7 +2319,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, - offsetof(struct htab_elem, key) + map->key_size); + offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 672abe111282..e3a2662f4e33 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1758,8 +1758,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1792,6 +1792,12 @@ 
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1803,8 +1809,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1842,6 +1848,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, @@ -2043,6 +2055,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); + case BPF_FUNC_perf_event_read_value: + return bpf_get_perf_event_read_value_proto(); default: return NULL; } @@ -2757,6 +2771,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. + * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. 
+ * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3066,6 +3135,50 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user return ret + 1; } +/** + * bpf_copy_from_user_task_str() - Copy a string from a task's address space + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address in the task's address space. + * @tsk: The task whose address space will be used + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL terminated string from a task's address space to @dst__sz + * buffer.
If user string is too long this will still ensure zero termination + * in the @dst__sz buffer unless buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success + * and memset all of @dst__sz on failure. + * + * Return: The number of copied bytes on success including the NUL terminator. + * A negative error code on failure. + */ +__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, + const void __user *unsafe_ptr__ign, + struct task_struct *tsk, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(dst__sz == 0)) + return 0; + + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset(dst, 0, dst__sz); + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset(dst + ret, 0, dst__sz - ret); + + return ret + 1; +} + /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only @@ -3161,6 +3274,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3173,6 +3287,7 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c 
index e8a772e64324..be66d7e520e0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #include <linux/bpf_mem_alloc.h> /* Intermediate node */ @@ -36,7 +37,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -342,7 +343,9 @@ static long trie_update_elem(struct bpf_map *map, if (!new_node) return -ENOMEM; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + goto out_free; new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -356,8 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*slot))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -442,8 +444,8 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); +out_free: if (ret) bpf_mem_cache_free(&trie->ma, new_node); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -467,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + return ret; /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. 
We will need to know the node @@ -478,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*trim))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -543,7 +546,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); bpf_mem_cache_free_rcu(&trie->ma, free_parent); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -592,7 +595,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - raw_spin_lock_init(&trie->lock); + raw_res_spin_lock_init(&trie->lock); /* Allocate intermediate and leaf nodes from the same allocator */ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 034cf87b54e9..632762b57299 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s) for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); - raw_spin_lock_init(&head->lock); + raw_res_spin_lock_init(&head->lock); head->first = NULL; } - raw_spin_lock_init(&s->extralist.lock); - s->extralist.first = NULL; return 0; } @@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, WRITE_ONCE(head->first, node); } -static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, +static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { - raw_spin_lock(&head->lock); - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); -} - 
-static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (!raw_spin_trylock(&s->extralist.lock)) + if (raw_res_spin_lock(&head->lock)) return false; - - pcpu_freelist_push_node(&s->extralist, node); - raw_spin_unlock(&s->extralist.lock); + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return true; } -static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) +void __pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) { - int cpu, orig_cpu; + struct pcpu_freelist_head *head; + int cpu; - orig_cpu = raw_smp_processor_id(); - while (1) { - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { - struct pcpu_freelist_head *head; + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) + return; + while (true) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id()) + continue; head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; - } - } - - /* cannot lock any per cpu lock, try extralist */ - if (pcpu_freelist_try_push_extra(s, node)) + if (raw_res_spin_lock(&head->lock)) + continue; + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return; + } } } -void __pcpu_freelist_push(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (in_nmi()) - ___pcpu_freelist_push_nmi(s, node); - else - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); -} - void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { @@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { + struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; int 
cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; - raw_spin_lock(&head->lock); + if (raw_res_spin_lock(&head->lock)) + continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); return node; } - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); } - - /* per cpu lists are all empty, try extralist */ - if (!READ_ONCE(s->extralist.first)) - return NULL; - raw_spin_lock(&s->extralist.lock); - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); - return node; -} - -static struct pcpu_freelist_node * -___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) -{ - struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; - int cpu; - - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { - head = per_cpu_ptr(s->freelist, cpu); - if (!READ_ONCE(head->first)) - continue; - if (raw_spin_trylock(&head->lock)) { - node = head->first; - if (node) { - WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); - return node; - } - raw_spin_unlock(&head->lock); - } - } - - /* cannot pop from per cpu lists, try extralist */ - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) - return NULL; - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { - if (in_nmi()) - return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3c76553cfe57..914798b74967 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -5,15 +5,15 @@ #define __PERCPU_FREELIST_H__ #include <linux/spinlock.h> #include 
<linux/percpu.h> +#include <asm/rqspinlock.h> struct pcpu_freelist_head { struct pcpu_freelist_node *first; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; - struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895..2fdf3c978db1 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -90,3 +90,4 @@ static void __exit fini(void) late_initcall(load); module_exit(fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c new file mode 100644 index 000000000000..b896c4a75a5c --- /dev/null +++ b/kernel/bpf/rqspinlock.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 
+ * + * Authors: Waiman Long <longman@redhat.com> + * Peter Zijlstra <peterz@infradead.org> + * Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <linux/prefetch.h> +#include <asm/byteorder.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif +#include <trace/events/lock.h> +#include <asm/rqspinlock.h> +#include <linux/timekeeping.h> + +/* + * Include queued spinlock definitions and statistics code + */ +#ifdef CONFIG_QUEUED_SPINLOCKS +#include "../locking/qspinlock.h" +#include "../locking/lock_events.h" +#include "rqspinlock.h" +#include "../locking/mcs_spinlock.h" +#endif + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at + * + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 + * + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. 
Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+/*
+ * Per-acquisition timeout bookkeeping used by check_timeout() and the
+ * RES_*_TIMEOUT macros. Times are ktime_get_mono_fast_ns() values.
+ */
+struct rqspinlock_timeout {
+	u64 timeout_end;	/* deadline; 0 means "not armed yet" */
+	u64 duration;		/* budget added to 'now' when the deadline is armed */
+	u64 cur;		/* time of the last deadlock check (or of arming) */
+	u16 spin;		/* spin counter used to amortize check_timeout() calls */
+};
+
+/* Stored into a waiter's MCS node->locked to tell it to abandon the queue. */
+#define RES_TIMEOUT_VAL	2
+
+/* Per-CPU table of locks held or being acquired, read by the deadlock checks. */
+DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
+
+/*
+ * Report whether none of the @mask bits are set in the lock word.
+ * The lock word is read with acquire ordering. @ts is unused.
+ */
+static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+{
+	return (atomic_read_acquire(&lock->val) & (mask)) == 0;
+}
+
+/*
+ * AA check: is @lock already recorded in this CPU's held-locks table?
+ *
+ * The last recorded entry is excluded from the scan. At most RES_NR_HELD
+ * entries are examined, so the lock word is not re-polled while scanning.
+ * @mask and @ts are unused.
+ *
+ * Returns -EDEADLK when @lock is found, 0 otherwise.
+ */
+static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
+				      struct rqspinlock_timeout *ts)
+{
+	struct rqspinlock_held *held = this_cpu_ptr(&rqspinlock_held_locks);
+	int nr_prior = min(RES_NR_HELD, held->cnt) - 1;
+	int idx;
+
+	for (idx = 0; idx < nr_prior; idx++) {
+		if (held->locks[idx] == lock)
+			return -EDEADLK;
+	}
+	return 0;
+}
+
+/*
+ * This focuses on the most common case of ABBA deadlocks (or ABBA involving
+ * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
+ * timeouts as the final line of defense.
+ */
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
+					struct rqspinlock_timeout *ts)
+{
+	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+	int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
+	void *remote_lock;
+	int cpu;
+
+	/*
+	 * Returns -EDEADLK when a remote CPU has @lock among its held entries
+	 * while its topmost entry is a lock held by this CPU. Returns 0 in
+	 * every other case: lock observed released, no match, or inconclusive.
+	 */
+
+	/*
+	 * Find the CPU holding the lock that we want to acquire. If there is a
+	 * deadlock scenario, we will read a stable set on the remote CPU and
+	 * find the target. This would be a constant time operation instead of
+	 * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
+	 * that requires increasing the size of the lock word.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
+		int real_cnt = READ_ONCE(rqh_cpu->cnt);
+		int cnt = min(RES_NR_HELD, real_cnt);
+
+		/*
+		 * Let's ensure to break out of this loop if the lock is available for
+		 * us to potentially acquire.
+		 */
+		if (is_lock_released(lock, mask, ts))
+			return 0;
+
+		/*
+		 * Skip ourselves, and CPUs whose count is less than 2, as they need at
+		 * least one held lock and one acquisition attempt (reflected as top
+		 * most entry) to participate in an ABBA deadlock.
+		 *
+		 * If cnt is more than RES_NR_HELD, it means the current lock being
+		 * acquired won't appear in the table, and other locks in the table are
+		 * already held, so we can't determine ABBA.
+		 */
+		if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
+			continue;
+
+		/*
+		 * Obtain the entry at the top, this corresponds to the lock the
+		 * remote CPU is attempting to acquire in a deadlock situation,
+		 * and would be one of the locks we hold on the current CPU.
+		 */
+		remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
+		/*
+		 * If it is NULL, we've raced and cannot determine a deadlock
+		 * conclusively, skip this CPU.
+		 */
+		if (!remote_lock)
+			continue;
+		/*
+		 * Find if the lock we're attempting to acquire is held by this CPU.
+		 * Don't consider the topmost entry, as that must be the latest lock
+		 * being held or acquired. For a deadlock, the target CPU must also
+		 * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
+		 * entries are important.
+		 */
+		for (int i = 0; i < cnt - 1; i++) {
+			if (READ_ONCE(rqh_cpu->locks[i]) != lock)
+				continue;
+			/*
+			 * We found our lock as held on the remote CPU. Is the
+			 * acquisition attempt on the remote CPU for a lock held
+			 * by us? If so, we have a deadlock situation, and need
+			 * to recover.
+			 *
+			 * The local table gets its own index 'j' so the remote
+			 * index 'i' is not shadowed.
+			 */
+			for (int j = 0; j < rqh_cnt - 1; j++) {
+				if (rqh->locks[j] == remote_lock)
+					return -EDEADLK;
+			}
+			/*
+			 * Inconclusive; retry again later.
+			 */
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Run the AA check first, then the ABBA check.
+ *
+ * Returns the first non-zero result (-EDEADLK), or 0 when neither check
+ * reports a deadlock.
+ */
+static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
+				   struct rqspinlock_timeout *ts)
+{
+	int ret;
+
+	ret = check_deadlock_AA(lock, mask, ts);
+	if (ret)
+		return ret;
+
+	return check_deadlock_ABBA(lock, mask, ts);
+}
+
+/*
+ * Timeout/deadlock poll used from the waiting loops.
+ *
+ * @lock: lock being acquired
+ * @mask: lock-word bits the caller is waiting on, forwarded to the checks
+ * @ts:   timeout state; timeout_end == 0 means the deadline is not armed
+ *
+ * Returns 0 to keep waiting, -ETIMEDOUT once the deadline has passed, or
+ * the result of check_deadlock() when more than a millisecond has elapsed
+ * since ts->cur was last updated.
+ */
+static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
+				  struct rqspinlock_timeout *ts)
+{
+	u64 time = ktime_get_mono_fast_ns();
+
+	/*
+	 * First call with an unarmed deadline: arm it and start the clock.
+	 * ts->cur is written here before it is ever read, since callers do not
+	 * initialise it.
+	 */
+	if (!ts->timeout_end) {
+		ts->cur = time;
+		ts->timeout_end = time + ts->duration;
+		return 0;
+	}
+
+	if (time > ts->timeout_end)
+		return -ETIMEDOUT;
+
+	/*
+	 * A millisecond interval passed from last time? Trigger deadlock
+	 * checks.
+	 */
+	if (ts->cur + NSEC_PER_MSEC < time) {
+		ts->cur = time;
+		return check_deadlock(lock, mask, ts);
+	}
+
+	return 0;
+}
+
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ *
+ * Both variants pass 'lock', which is not a macro parameter: a variable
+ * named 'lock' must be in scope wherever RES_CHECK_TIMEOUT is used. Both
+ * must also match check_timeout()'s (lock, mask, ts) signature.
+ */
+#ifndef res_smp_cond_load_acquire
+#define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
+	({                                                            \
+		if (!(ts).spin++)                                     \
+			(ret) = check_timeout((lock), (mask), &(ts)); \
+		(ret);                                                \
+	})
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
+	({ (ret) = check_timeout((lock), (mask), &(ts)); })
+#endif
+
+/*
+ * Initialize the 'spin' member.
+ * Set spin member to 0 to trigger AA/ABBA checks immediately.
+ */
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+
+/*
+ * Disarm the deadline and set the budget for the next wait. 'timeout_end'
+ * is cleared so the following check_timeout() call re-arms it; 'spin' is
+ * left alone and simply wraps.
+ */
+#define RES_RESET_TIMEOUT(ts, _duration)		\
+	({						\
+		(ts).timeout_end = 0;			\
+		(ts).duration = _duration;		\
+	})
+
+/*
+ * Test-and-set fallback, provided for cases when queued spin lock support
+ * is absent from the architecture.
+ *
+ * Spins until the lock word is observed as 0 and a cmpxchg to 1 succeeds.
+ * A held-lock entry is grabbed before spinning and released again on failure.
+ *
+ * Returns 0 on success, otherwise the non-zero value produced by
+ * RES_CHECK_TIMEOUT (-ETIMEDOUT or -EDEADLK).
+ */
+int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
+{
+	struct rqspinlock_timeout ts;
+	int val, ret = 0;
+
+	RES_INIT_TIMEOUT(ts);
+	grab_held_lock_entry(lock);
+
+	/*
+	 * Waiting time here scales with contention, so the short timeout
+	 * used by the rqspinlock waiting loops is not enough; use one second.
+	 */
+	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
+
+	for (;;) {
+		val = atomic_read(&lock->val);
+
+		/* Only attempt the cmpxchg when the word was seen as free. */
+		if (!val && atomic_try_cmpxchg(&lock->val, &val, 1))
+			return 0;
+
+		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+			break;
+		cpu_relax();
+	}
+
+	release_held_lock_entry();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
+/**
+ * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ */
+int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
+{
+	struct mcs_spinlock *prev, *next, *node;
+	struct rqspinlock_timeout ts;
+	int idx, ret = 0;
+	u32 old, tail;
+
+	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+	if (resilient_virt_spin_lock_enabled())
+		return resilient_virt_spin_lock(lock);
+
+	RES_INIT_TIMEOUT(ts);
+
+	/*
+	 * Wait for in-progress pending->locked hand-overs with a bounded
+	 * number of spins so that we guarantee forward progress.
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	if (val == _Q_PENDING_VAL) {
+		int cnt = _Q_PENDING_LOOPS;
+		val = atomic_cond_read_relaxed(&lock->val,
+					       (VAL != _Q_PENDING_VAL) || !cnt--);
+	}
+
+	/*
+	 * If we observe any contention; queue.
+	 */
+	if (val & ~_Q_LOCKED_MASK)
+		goto queue;
+
+	/*
+	 * trylock || pending
+	 *
+	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+	 */
+	val = queued_fetch_set_pending_acquire(lock);
+
+	/*
+	 * If we observe contention, there is a concurrent locker.
+	 *
+	 * Undo and queue; our setting of PENDING might have made the
+	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+	 * on @next to become !NULL.
+	 */
+	if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+		/* Undo PENDING if we set it. */
+		if (!(val & _Q_PENDING_MASK))
+			clear_pending(lock);
+
+		goto queue;
+	}
+
+	/*
+	 * Grab an entry in the held locks array, to enable deadlock detection.
+	 */
+	grab_held_lock_entry(lock);
+
+	/*
+	 * We're pending, wait for the owner to go away.
+	 *
+	 * 0,1,1 -> *,1,0
+	 *
+	 * this wait loop must be a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because not all
+	 * clear_pending_set_locked() implementations imply full
+	 * barriers.
+	 */
+	if (val & _Q_LOCKED_MASK) {
+		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+	}
+
+	if (ret) {
+		/*
+		 * We waited for the locked bit to go back to 0, as the pending
+		 * waiter, but timed out. We need to clear the pending bit since
+		 * we own it. Once a stuck owner has been recovered, the lock
+		 * must be restored to a valid state, hence removing the pending
+		 * bit is necessary.
+		 *
+		 * *,1,* -> *,0,*
+		 */
+		clear_pending(lock);
+		lockevent_inc(rqspinlock_lock_timeout);
+		goto err_release_entry;
+	}
+
+	/*
+	 * take ownership and clear the pending bit.
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	clear_pending_set_locked(lock);
+	lockevent_inc(lock_pending);
+	return 0;
+
+	/*
+	 * End of pending bit optimistic spinning and beginning of MCS
+	 * queuing.
+	 */
+queue:
+	lockevent_inc(lock_slowpath);
+	/*
+	 * Grab deadlock detection entry for the queue path.
+	 */
+	grab_held_lock_entry(lock);
+
+	node = this_cpu_ptr(&rqnodes[0].mcs);
+	idx = node->count++;
+	tail = encode_tail(smp_processor_id(), idx);
+
+	trace_contention_begin(lock, LCB_F_SPIN);
+
+	/*
+	 * 4 nodes are allocated based on the assumption that there will
+	 * not be nested NMIs taking spinlocks. That may not be true in
+	 * some architectures even though the chance of needing more than
+	 * 4 nodes will still be extremely unlikely. When that happens,
+	 * we fall back to spinning on the lock directly without using
+	 * any MCS node. This is not the most elegant solution, but is
+	 * simple enough.
+	 */
+	if (unlikely(idx >= _Q_MAX_NODES)) {
+		lockevent_inc(lock_no_node);
+		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+		while (!queued_spin_trylock(lock)) {
+			if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
+				lockevent_inc(rqspinlock_lock_timeout);
+				goto err_release_node;
+			}
+			cpu_relax();
+		}
+		goto release;
+	}
+
+	node = grab_mcs_node(node, idx);
+
+	/*
+	 * Keep counts of non-zero index values:
+	 */
+	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+	/*
+	 * Ensure that we increment the head node->count before initialising
+	 * the actual node. If the compiler is kind enough to reorder these
+	 * stores, then an IRQ could overwrite our assignments.
+	 */
+	barrier();
+
+	node->locked = 0;
+	node->next = NULL;
+
+	/*
+	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
+	 * attempt the trylock once more in the hope someone let go while we
+	 * weren't watching.
+	 */
+	if (queued_spin_trylock(lock))
+		goto release;
+
+	/*
+	 * Ensure that the initialisation of @node is complete before we
+	 * publish the updated tail via xchg_tail() and potentially link
+	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+	 */
+	smp_wmb();
+
+	/*
+	 * Publish the updated tail.
+	 * We have already touched the queueing cacheline; don't bother with
+	 * pending stuff.
+	 *
+	 * p,*,* -> n,*,*
+	 */
+	old = xchg_tail(lock, tail);
+	next = NULL;
+
+	/*
+	 * if there was a previous node; link it and wait until reaching the
+	 * head of the waitqueue.
+	 */
+	if (old & _Q_TAIL_MASK) {
+		/*
+		 * This block-local 'val' shadows the u32 parameter; it only
+		 * carries the value observed in node->locked.
+		 */
+		int val;
+
+		prev = decode_tail(old, rqnodes);
+
+		/* Link @node into the waitqueue. */
+		WRITE_ONCE(prev->next, node);
+
+		val = arch_mcs_spin_lock_contended(&node->locked);
+		/*
+		 * RES_TIMEOUT_VAL is what an aborting queue head writes into
+		 * its successor's node (see waitq_timeout below): give up too.
+		 */
+		if (val == RES_TIMEOUT_VAL) {
+			ret = -EDEADLK;
+			goto waitq_timeout;
+		}
+
+		/*
+		 * While waiting for the MCS lock, the next pointer may have
+		 * been set by another lock waiter. We optimistically load
+		 * the next pointer & prefetch the cacheline for writing
+		 * to reduce latency in the upcoming MCS unlock operation.
+		 */
+		next = READ_ONCE(node->next);
+		if (next)
+			prefetchw(next);
+	}
+
+	/*
+	 * we're at the head of the waitqueue, wait for the owner & pending to
+	 * go away.
+	 *
+	 * *,x,y -> *,0,0
+	 *
+	 * this wait loop must use a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because the set_locked() function below
+	 * does not imply a full barrier.
+	 *
+	 * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
+	 * meant to span maximum allowed time per critical section, and we may
+	 * have both the owner of the lock and the pending bit waiter ahead of
+	 * us.
+	 */
+	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
+	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+
+waitq_timeout:
+	if (ret) {
+		/*
+		 * If the tail is still pointing to us, then we are the final waiter,
+		 * and are responsible for resetting the tail back to 0. Otherwise, if
+		 * the cmpxchg operation fails, we signal the next waiter to take exit
+		 * and try the same. For a waiter with tail node 'n':
+		 *
+		 * n,*,* -> 0,*,*
+		 *
+		 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
+		 * possible locked/pending bits keep changing and we see failures even
+		 * when we remain the head of wait queue. However, eventually,
+		 * pending bit owner will unset the pending bit, and new waiters
+		 * will queue behind us. This will leave the lock owner in
+		 * charge, and it will eventually either set locked bit to 0, or
+		 * leave it as 1, allowing us to make progress.
+		 *
+		 * We terminate the whole wait queue for two reasons. Firstly,
+		 * we eschew per-waiter timeouts with one applied at the head of
+		 * the wait queue. This allows everyone to break out faster
+		 * once we've seen the owner / pending waiter not responding for
+		 * the timeout duration from the head. Secondly, it avoids
+		 * complicated synchronization, because when not leaving in FIFO
+		 * order, prev's next pointer needs to be fixed up etc.
+		 */
+		if (!try_cmpxchg_tail(lock, tail, 0)) {
+			next = smp_cond_load_relaxed(&node->next, VAL);
+			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
+		}
+		lockevent_inc(rqspinlock_lock_timeout);
+		goto err_release_node;
+	}
+
+	/*
+	 * claim the lock:
+	 *
+	 * n,0,0 -> 0,0,1 : lock, uncontended
+	 * *,*,0 -> *,*,1 : lock, contended
+	 *
+	 * If the queue head is the only one in the queue (lock value == tail)
+	 * and nobody is pending, clear the tail code and grab the lock.
+	 * Otherwise, we only need to grab the lock.
+	 */
+
+	/*
+	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+	 * above wait condition, therefore any concurrent setting of
+	 * PENDING will make the uncontended transition fail.
+	 */
+	if ((val & _Q_TAIL_MASK) == tail) {
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+			goto release; /* No contention */
+	}
+
+	/*
+	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
+	 * which will then detect the remaining tail and queue behind us
+	 * ensuring we'll see a @next.
+	 */
+	set_locked(lock);
+
+	/*
+	 * contended path; wait for next if not observed yet, release.
+	 */
+	if (!next)
+		next = smp_cond_load_relaxed(&node->next, (VAL));
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+
+	/*
+	 * Success exit for the queue path: the held-lock entry grabbed at
+	 * 'queue:' is kept, only the per-CPU node count is dropped.
+	 */
+release:
+	trace_contention_end(lock, 0);
+
+	/*
+	 * release the node
+	 */
+	__this_cpu_dec(rqnodes[0].mcs.count);
+	return ret;
+	/*
+	 * Failure exits: err_release_node drops the node count and falls
+	 * through; err_release_entry gives back the held-lock entry.
+	 */
+err_release_node:
+	trace_contention_end(lock, ret);
+	__this_cpu_dec(rqnodes[0].mcs.count);
+err_release_entry:
+	release_held_lock_entry();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Disable preemption and take @lock via res_spin_lock().
+ *
+ * Returns 0 with preemption still disabled on success. On failure,
+ * preemption is re-enabled and the res_spin_lock() error is returned.
+ */
+__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
+	BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
+
+	preempt_disable();
+	ret = res_spin_lock((rqspinlock_t *)lock);
+	if (unlikely(ret)) {
+		preempt_enable();
+		return ret;
+	}
+	return 0;
+}
+
+/* Release @lock, then re-enable preemption (reverse of bpf_res_spin_lock()). */
+__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
+{
+	res_spin_unlock((rqspinlock_t *)lock);
+	preempt_enable();
+}
+
+/*
+ * Like bpf_res_spin_lock(), but also saves and disables local interrupts.
+ *
+ * On success the saved flags are stored through @flags__irq_flag and 0 is
+ * returned. On failure interrupts and preemption are restored, nothing is
+ * stored, and the res_spin_lock() error is returned.
+ *
+ * NOTE(review): the store is 64 bits wide through a pointer declared as
+ * unsigned long *; presumably the caller always provides an 8-byte slot -
+ * confirm against the verifier.
+ */
+__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+	u64 *ptr = (u64 *)flags__irq_flag;
+	unsigned long flags;
+	int ret;
+
+	preempt_disable();
+	local_irq_save(flags);
+	ret = res_spin_lock((rqspinlock_t *)lock);
+	if (unlikely(ret)) {
+		local_irq_restore(flags);
+		preempt_enable();
+		return ret;
+	}
+	*ptr = flags;
+	return 0;
+}
+
+/*
+ * Release @lock, restore the interrupt flags read from @flags__irq_flag,
+ * then re-enable preemption (reverse of bpf_res_spin_lock_irqsave()).
+ */
+__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+	u64 *ptr = (u64 *)flags__irq_flag;
+	unsigned long flags = *ptr;
+
+	res_spin_unlock((rqspinlock_t *)lock);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(rqspinlock_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock)
+BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
+BTF_ID_FLAGS(func,
bpf_res_spin_unlock_irqrestore) +BTF_KFUNCS_END(rqspinlock_kfunc_ids) + +static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { + .owner = THIS_MODULE, + .set = &rqspinlock_kfunc_ids, +}; + +static __init int rqspinlock_register_kfuncs(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); +} +late_initcall(rqspinlock_register_kfuncs); diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h new file mode 100644 index 000000000000..5d8cb1b1aab4 --- /dev/null +++ b/kernel/bpf/rqspinlock.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock defines + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __LINUX_RQSPINLOCK_H +#define __LINUX_RQSPINLOCK_H + +#include "../locking/qspinlock.h" + +/* + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value + * @lock: Pointer to queued spinlock structure + * @tail: The tail to compare against + * @new_tail: The new queue tail code word + * Return: Bool to indicate whether the cmpxchg operation succeeded + * + * This is used by the head of the wait queue to clean up the queue. + * Provides relaxed ordering, since observers only rely on initialized + * state of the node which was made visible through the xchg_tail operation, + * i.e. through the smp_wmb preceding xchg_tail. + * + * We avoid using 16-bit cmpxchg, which is not available on all architectures. + */ +static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + /* + * Is the tail part we compare to already stale? Fail. + */ + if ((old & _Q_TAIL_MASK) != tail) + return false; + /* + * Encode latest locked/pending state for new tail. 
+ */ + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return true; +} + +#endif /* __LINUX_RQSPINLOCK_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e1e42e918ba7..9794446bc8c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return try_alloc_pages(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } @@ -648,6 +665,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -700,6 +718,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -777,6 +796,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) switch (fields[i].type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); @@ 
-1212,7 +1232,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { @@ -1231,6 +1251,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case 0: continue; case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && @@ -1315,7 +1336,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1505,7 +1526,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1593,11 +1614,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -2314,6 +2332,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); + kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2944,7 +2963,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) 
if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -4169,7 +4188,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4733,6 +4753,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4879,8 +4901,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -5121,15 +5141,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ return btf_new_fd(attr, uattr, uattr_size); } -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (attr->open_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->open_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } @@ -5768,13 +5807,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, 
unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6e604caa870c..54c6953a8b84 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -456,7 +456,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -579,6 +579,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -1148,7 +1155,8 @@ static int release_irq_state(struct bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, - struct bpf_reg_state *reg, int insn_idx) + struct bpf_reg_state *reg, int insn_idx, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1170,6 +1178,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; + st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; @@ -1178,7 +1187,8 @@ static int 
mark_stack_slot_irq_flag(struct bpf_verifier_env *env, return 0; } -static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1192,6 +1202,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; + if (st->irq.kfunc_class != kfunc_class) { + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", + flag_kfunc, used_kfunc); + return -EINVAL; + } + err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { @@ -1409,6 +1428,8 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_preempt_locks = src->active_preempt_locks; dst->active_rcu_lock = src->active_rcu_lock; dst->active_irq_id = src->active_irq_id; + dst->active_lock_id = src->active_lock_id; + dst->active_lock_ptr = src->active_lock_ptr; return 0; } @@ -1508,6 +1529,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r s->ptr = ptr; state->active_locks++; + state->active_lock_id = id; + state->active_lock_ptr = ptr; return 0; } @@ -1545,18 +1568,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } +static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) + if (state->refs[i].id == ptr_id) + return true; + + return false; +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { + void *prev_ptr = NULL; + u32 prev_id = 0; int 
i; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) - continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (state->refs[i].type == type && state->refs[i].id == id && + state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; + /* Reassign active lock (id, ptr). */ + state->active_lock_id = prev_id; + state->active_lock_ptr = prev_ptr; return 0; } + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { + prev_id = state->refs[i].id; + prev_ptr = state->refs[i].ptr; + } } return -EINVAL; } @@ -1591,7 +1633,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type != type) + if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) @@ -1600,6 +1642,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st return NULL; } +static void update_peak_states(struct bpf_verifier_env *env) +{ + u32 cur_states; + + cur_states = env->explored_states_size + env->free_list_size; + env->peak_states = max(env->peak_states, cur_states); +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1622,6 +1672,50 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } +/* struct bpf_verifier_state->{parent,loop_entry} refer to states + * that are in either of env->{explored_states,free_list}. + * In both cases the state is contained in struct bpf_verifier_state_list.
+ */ +static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) +{ + if (st->parent) + return container_of(st->parent, struct bpf_verifier_state_list, state); + return NULL; +} + +static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) +{ + if (st->loop_entry) + return container_of(st->loop_entry, struct bpf_verifier_state_list, state); + return NULL; +} + +/* A state can be freed if it is no longer referenced: + * - is in the env->free_list; + * - has no children states; + * - is not used as loop_entry. + * + * Freeing a state can make its loop_entry free-able. + */ +static void maybe_free_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state_list *sl) +{ + struct bpf_verifier_state_list *loop_entry_sl; + + while (sl && sl->in_free_list && + sl->state.branches == 0 && + sl->state.used_as_loop_entry == 0) { + loop_entry_sl = state_loop_entry_as_list(&sl->state); + if (loop_entry_sl) + loop_entry_sl->state.used_as_loop_entry--; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; + sl = loop_entry_sl; + } +} + /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ @@ -1661,6 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->loop_entry = src->loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1681,7 +1776,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +static struct list_head *explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct
bpf_func_state *state = cur->frame[cur->curframe]; @@ -1789,16 +1884,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Find outermost loop entry known for n * def get_loop_entry(n): * h = entries.get(n, None) - * while h in entries and entries[h] != h: + * while h in entries: * h = entries[h] * return h * - * # Update n's loop entry if h's outermost entry comes - * # before n's outermost entry in current DFS path. + * # Update n's loop entry if h comes before n in current DFS path. * def update_loop_entry(n, h): - * n1 = get_loop_entry(n) or n - * h1 = get_loop_entry(h) or h - * if h1 in path and depths[h1] <= depths[n1]: + * if h in path and depths[entries.get(n, n)] < depths[n]: * entries[n] = h1 * * def dfs(n, depth): @@ -1810,7 +1902,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Case A: explore succ and update cur's loop entry * # only if succ's entry is in current DFS path. * dfs(succ, depth + 1) - * h = get_loop_entry(succ) + * h = entries.get(succ, None) * update_loop_entry(n, h) * else: * # Case B or C depending on `h1 in path` check in update_loop_entry(). @@ -1822,46 +1914,49 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * and cur's loop entry has to be updated (case A), handle this in * update_branch_counts(); * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(); - * - update topmost loop entry for intermediate states in get_loop_entry(). + * - handle cases B and C in is_state_visited(). 
*/ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { - struct bpf_verifier_state *topmost = st->loop_entry, *old; + struct bpf_verifier_state *topmost = st->loop_entry; + u32 steps = 0; - while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + while (topmost && topmost->loop_entry) { + if (steps++ > st->dfs_depth) { + WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n"); + verbose(env, "verifier bug: infinite loop in get_loop_entry()\n"); + return ERR_PTR(-EFAULT); + } topmost = topmost->loop_entry; - /* Update loop entries for intermediate states to avoid this - * traversal in future get_loop_entry() calls. - */ - while (st && st->loop_entry != topmost) { - old = st->loop_entry; - st->loop_entry = topmost; - st = old; } return topmost; } -static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +static void update_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) { - struct bpf_verifier_state *cur1, *hdr1; - - cur1 = get_loop_entry(cur) ?: cur; - hdr1 = get_loop_entry(hdr) ?: hdr; - /* The head1->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr1->branches == 0 then + /* The hdr->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr->branches == 0 then * head's topmost loop entry is not in current DFS path, * hence 'cur' and 'hdr' are not in the same loop and there is * no need to update cur->loop_entry. 
*/ - if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { + if (cur->loop_entry) { + cur->loop_entry->used_as_loop_entry--; + maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); + } cur->loop_entry = hdr; - hdr->used_as_loop_entry = true; + hdr->used_as_loop_entry++; } } static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { + struct bpf_verifier_state_list *sl = NULL, *parent_sl; + struct bpf_verifier_state *parent; + while (st) { u32 br = --st->branches; @@ -1871,7 +1966,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi * This is a part of 'case A' in get_loop_entry() comment. */ if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(st->parent, st->loop_entry); + update_loop_entry(env, st->parent, st->loop_entry); /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: @@ -1881,7 +1976,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; - st = st->parent; + parent = st->parent; + parent_sl = state_parent_as_list(st); + if (sl) + maybe_free_verifier_state(env, sl); + st = parent; + sl = parent_sl; } } @@ -3206,6 +3306,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return res ? 
&res->func_model : NULL; } +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i = 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3269,6 +3384,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3295,10 +3419,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3483,7 +3604,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. 
*/ @@ -3828,6 +3949,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -4028,11 +4160,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4050,7 +4177,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -4097,7 +4224,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -5982,18 +6109,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) + enum bpf_access_type t, struct bpf_insn_access_aux *info) { - struct bpf_insn_access_aux info = { - .reg_type = *reg_type, - .log = &env->log, - .is_retval = false, - .is_ldsx = is_ldsx, - }; - if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -6001,14 +6120,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - *reg_type = info.reg_type; - *is_retval = info.is_retval; - - if (base_type(*reg_type) == PTR_TO_BTF_ID) { - *btf = info.btf; - *btf_id = info.btf_id; + if (base_type(info->reg_type) == PTR_TO_BTF_ID) { + if (info->ref_obj_id && + !find_reference_state(env->cur_state, info->ref_obj_id)) { + verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", + off); + return -EACCES; + } } else { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -6118,6 +6238,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. 
+ */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7365,11 +7505,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - bool is_retval = false; struct bpf_retval_range range; - enum bpf_reg_type reg_type = SCALAR_VALUE; - struct btf *btf = NULL; - u32 btf_id = 0; + struct bpf_insn_access_aux info = { + .reg_type = SCALAR_VALUE, + .is_ldsx = is_ldsx, + .log = &env->log, + }; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7381,8 +7522,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, - &btf_id, &is_retval, is_ldsx); + err = check_ctx_access(env, insn_idx, off, size, t, &info); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -7390,8 +7530,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero.
*/ - if (reg_type == SCALAR_VALUE) { - if (is_retval && get_func_retval_range(env->prog, &range)) { + if (info.reg_type == SCALAR_VALUE) { + if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) @@ -7402,7 +7542,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(reg_type)) + if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -7410,12 +7550,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (base_type(reg_type) == PTR_TO_BTF_ID) { - regs[value_regno].btf = btf; - regs[value_regno].btf_id = btf_id; + if (base_type(info.reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = info.btf; + regs[value_regno].btf_id = info.btf_id; + regs[value_regno].ref_obj_id = info.ref_obj_id; } } - regs[value_regno].type = reg_type; + regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -7518,27 +7659,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: 
- case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable.
*/ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7574,11 +7760,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7605,12 +7787,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7620,13 +7802,86 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. 
*/ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_load_mem(env, insn, true, false, false, "atomic_load"); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_store_reg(env, insn, true); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* 
When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -7985,6 +8240,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg return err; } +enum { + PROCESS_SPIN_LOCK = (1 << 0), + PROCESS_RES_LOCK = (1 << 1), + PROCESS_LOCK_IRQ = (1 << 2), +}; + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. @@ -8007,30 +8268,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) +static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + u32 spin_lock_off; int err; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset.
%s_lock has to be at the constant offset\n", + regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); + "map '%s' has to have BTF in order to use %s_lock\n", + map->name, lock_str); return -EINVAL; } } else { @@ -8038,36 +8302,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } rec = reg_btf_record(reg); - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", - map ? map->name : "kptr"); + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", + map ? map->name : "kptr", lock_str); return -EINVAL; } - if (rec->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", - val + reg->off, rec->spin_lock_off); + spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; + if (spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", + val + reg->off, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; + int type; if (map) ptr = map; else ptr = btf; - if (cur->active_locks) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; + if (!is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + } else if (is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); + return -EINVAL; + } } - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; + int type; if (map) ptr = map; @@ -8075,12 +8356,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, ptr = btf; if (!cur->active_locks) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); + verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { - verbose(env, "bpf_spin_unlock of different lock\n"); + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + if (!find_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); + return -EINVAL; + } + if (reg->id != cur->active_lock_id || ptr != 
cur->active_lock_ptr) { + verbose(env, "%s_unlock cannot be out of order\n", lock_str); + return -EINVAL; + } + if (release_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } @@ -8431,10 +8726,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; + struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - sl = *explored_state(env, insn_idx); - for (; sl; sl = sl->next) { + head = explored_state(env, insn_idx); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. */ @@ -9404,11 +9701,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, true); + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, false); + err = process_spin_lock(env, regno, 0); if (err) return err; } else { @@ -9666,7 +9963,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } break; @@ -10237,23 +10534,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. 
*/ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -10752,6 +11044,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; + enum bpf_prog_type type = resolve_prog_type(env->prog); + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; @@ -10761,6 +11055,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; + /* Allow struct_ops programs to return a referenced kptr back to + * kernel. Type checks are performed later in check_return_code. 
+ */ + if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && + reg->ref_obj_id == state->refs[i].id) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -11287,7 +11587,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11459,10 +11759,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. */ -static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, - size_t reg_size) +static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, + u32 regno, size_t reg_size) { - struct bpf_reg_state *reg = &cur_regs(env)[regno]; + struct bpf_reg_state *reg = ®s[regno]; if (regno == BPF_REG_0) { /* Function return value */ @@ -11480,6 +11780,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); +} + static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -11617,6 +11923,7 @@ enum { KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, + KF_ARG_RES_SPIN_LOCK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11626,6 +11933,7 @@ BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) +BTF_ID(struct, bpf_res_spin_lock) static bool 
__is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11674,6 +11982,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -11745,6 +12058,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, + KF_ARG_PTR_TO_RES_SPIN_LOCK, }; enum special_kfunc_type { @@ -11781,6 +12095,12 @@ enum special_kfunc_type { KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, + KF_bpf_set_dentry_xattr, + KF_bpf_remove_dentry_xattr, + KF_bpf_res_spin_lock, + KF_bpf_res_spin_unlock, + KF_bpf_res_spin_lock_irqsave, + KF_bpf_res_spin_unlock_irqrestore, }; BTF_SET_START(special_kfunc_set) @@ -11810,6 +12130,10 @@ BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#endif BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -11859,6 +12183,17 @@ BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_res_spin_lock) +BTF_ID(func, bpf_res_spin_unlock) +BTF_ID(func, bpf_res_spin_lock_irqsave) +BTF_ID(func, bpf_res_spin_unlock_irqrestore) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11952,6 +12287,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_irq_flag(meta->btf, 
&args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RES_SPIN_LOCK; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -12059,13 +12397,19 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; - int err; - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + kfunc_class = IRQ_LOCK_KFUNC; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + kfunc_class = IRQ_LOCK_KFUNC; } else { verbose(env, "verifier internal error: unknown irq flags kfunc\n"); return -EFAULT; @@ -12081,7 +12425,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { @@ -12095,7 +12439,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = unmark_stack_slot_irq_flag(env, reg); + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } @@ -12222,7 +12566,7 @@ static int check_reg_allocation_locked(struct 
bpf_verifier_env *env, struct bpf_ if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -12258,9 +12602,18 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } +static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; +} + static bool kfunc_spin_allowed(u32 btf_id) { - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || + is_bpf_res_spin_lock_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) @@ -12692,6 +13045,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_IRQ_FLAG: + case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: WARN_ON_ONCE(1); @@ -12990,6 +13344,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + { + int flags = PROCESS_RES_LOCK; + + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + return -EINVAL; + } + + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) + return -EFAULT; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + flags |= PROCESS_SPIN_LOCK; + if (meta->func_id == 
special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + flags |= PROCESS_LOCK_IRQ; + ret = process_spin_lock(env, regno, flags); + if (ret < 0) + return ret; + break; + } } } @@ -13075,6 +13451,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + if (!insn->off && + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (!branch) { + verbose(env, "failed to push state for failed lock acquisition\n"); + return -ENOMEM; + } + + regs = branch->frame[branch->curframe]->regs; + + /* Clear r0-r5 registers in forked state */ + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + mark_reg_unknown(env, regs, BPF_REG_0); + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); + if (err) { + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); + return err; + } + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; @@ -13245,6 +13648,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) + __mark_reg_const_zero(env, ®s[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); @@ -16399,13 +16805,14 @@ 
static int check_return_code(struct bpf_verifier_env *env, int regno, const char const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; bool return_32bit = false; + const struct btf_type *reg_type, *ret_type = NULL; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -16414,10 +16821,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (prog->expected_attach_type == BPF_LSM_CGROUP) /* See below, can be 0 or 0-1 depending on hook. */ break; - fallthrough; + if (!prog->aux->attach_func_proto->type) + return 0; + break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return 0; + + if (frame->in_exception_callback_fn) + break; + + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? 
btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); break; default: break; @@ -16439,8 +16862,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EACCES; } - reg = cur_regs(env) + regno; - if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; @@ -16539,6 +16960,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_NETFILTER: range = retval_range(NF_DROP, NF_ACCEPT); break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!ret_type) + return 0; + range = retval_range(0, 0); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -16582,6 +17008,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. 
@@ -16595,6 +17029,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -16753,27 +17188,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -16790,24 +17204,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. 
+ */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. 
@@ -16890,39 +17334,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -16980,7 +17408,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. 
*/ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17062,9 +17490,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17083,6 +17522,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. 
+ */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17125,9 +17571,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17139,6 +17584,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17152,6 +17608,7 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17170,13 +17627,10 @@ walk_cfg: goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -17199,6 +17653,7 @@ walk_cfg: } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -17815,18 +18270,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void 
clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { + struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; + struct list_head *pos, *head; - sl = *explored_state(env, insn); - while (sl) { + head = explored_state(env, insn); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) - goto next; + continue; + loop_entry = get_loop_entry(env, &sl->state); + if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) + continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) - goto next; + continue; clean_verifier_state(env, &sl->state); -next: - sl = sl->next; } } @@ -18127,7 +18586,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; case STACK_MISC: @@ -18162,6 +18622,10 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || + old->active_lock_ptr != cur->active_lock_ptr) + return false; + for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) @@ -18171,6 +18635,8 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: + case REF_TYPE_RES_LOCK: + case REF_TYPE_RES_LOCK_IRQ: if (old->refs[i].ptr != cur->refs[i].ptr) return false; break; @@ -18210,15 +18676,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state 
will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18239,6 +18707,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18262,9 +18731,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? 
env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; @@ -18517,10 +18989,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl, **pprev; + struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; bool force_new_state, add_new_state, force_exact; + struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ @@ -18539,15 +19012,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; - pprev = explored_state(env, insn_idx); - sl = *pprev; - clean_live_states(env, insn_idx, cur); - while (sl) { + head = explored_state(env, insn_idx); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); states_cnt++; if (sl->state.insn_idx != insn_idx) - goto next; + continue; if (sl->state.branches) { struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; @@ -18621,7 +19093,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18630,7 +19102,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if 
(is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18697,11 +19169,13 @@ skip_inf_loop_check: * * Additional details are in the comment before get_loop_entry(). */ - loop_entry = get_loop_entry(&sl->state); + loop_entry = get_loop_entry(env, &sl->state); + if (IS_ERR(loop_entry)) + return PTR_ERR(loop_entry); force_exact = loop_entry && loop_entry->branches > 0; if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) - update_loop_entry(cur, loop_entry); + update_loop_entry(env, cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -18750,31 +19224,13 @@ miss: /* the state is unlikely to be useful. Remove it to * speed up verification */ - *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && - !sl->state.used_as_loop_entry) { - u32 br = sl->state.branches; - - WARN_ONCE(br, - "BUG live_done but branches_to_explore %d\n", - br); - free_verifier_state(&sl->state, false); - kfree(sl); - env->peak_states--; - } else { - /* cannot free this state, since parentage chain may - * walk it later. 
Add it for free_list instead to - * be freed at the end of verification - */ - sl->next = env->free_list; - env->free_list = sl; - } - sl = *pprev; - continue; + sl->in_free_list = true; + list_del(&sl->node); + list_add(&sl->node, &env->free_list); + env->free_list_size++; + env->explored_states_size--; + maybe_free_verifier_state(env, sl); } -next: - pprev = &sl->next; - sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -18799,7 +19255,8 @@ next: if (!new_sl) return -ENOMEM; env->total_states++; - env->peak_states++; + env->explored_states_size++; + update_peak_states(env); env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; @@ -18823,8 +19280,8 @@ next: cur->first_insn_idx = insn_idx; cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - new_sl->next = *explored_state(env, insn_idx); - *explored_state(env, insn_idx) = new_sl; + list_add(&new_sl->node, head); + /* connect new state to parentage chain. Current frame needs all * registers connected. 
Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -19011,19 +19468,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19045,37 +19496,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). 
*/ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; @@ -19087,25 +19519,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { @@ -19245,6 +19659,10 @@ process_bpf_exit: return err; break; } else { + if (WARN_ON_ONCE(env->cur_state->loop_entry)) { + verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n"); + return -EFAULT; + } do_print_state = true; continue; } @@ -19504,7 +19922,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -20334,7 +20752,7 @@ static int 
convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; @@ -20363,6 +20781,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; } } @@ -20383,6 +20805,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; } } @@ -20415,7 +20841,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); @@ -20723,6 +21151,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -20939,6 +21368,14 @@ static void specialize_kfunc(struct 
bpf_verifier_env *env, */ env->seen_direct_write = seen_direct_write; } + + if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_set_dentry_xattr_locked; + + if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_remove_dentry_xattr_locked; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21373,7 +21810,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. 
+ */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -21897,6 +22377,13 @@ next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; subprogs[cur_subprog].stack_extra = stack_depth_extra; + + stack_depth = subprogs[cur_subprog].stack_depth; + if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { + verbose(env, "stack size %d(extra %d) is too large\n", + stack_depth, stack_depth_extra); + return -EINVAL; + } cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_extra = 0; @@ -21907,23 +22394,33 @@ next_insn: env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { + /* We need two slots in case timed may_goto is supported. 
*/ + if (stack_slots > slots) { verbose(env, "verifier bug: stack_slots supports may_goto only\n"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -21933,7 +22430,7 @@ next_insn: * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. 
*/ @@ -22131,31 +22628,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) static void free_states(struct bpf_verifier_env *env) { - struct bpf_verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl; + struct list_head *head, *pos, *tmp; int i; - sl = env->free_list; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, &env->free_list) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->free_list = NULL; + INIT_LIST_HEAD(&env->free_list); if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { - sl = env->explored_states[i]; + head = &env->explored_states[i]; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->explored_states[i] = NULL; + INIT_LIST_HEAD(&env->explored_states[i]); } } @@ -22163,6 +22658,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -22270,6 +22766,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, BPF_REG_1); } + /* Acquire references for struct_ops program arguments tagged with "__ref" */ + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + for (i = 0; i < aux->ctx_arg_info_size; i++) + aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? 
+ acquire_reference(env, 0) : 0; + } + ret = do_check(env); out: /* check for NULL is necessary, since cur_state can be freed inside @@ -22392,6 +22895,15 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt) +{ + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info_size = cnt; + + return prog->aux->ctx_arg_info ? 0 : -ENOMEM; +} + static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; @@ -22399,10 +22911,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; - u32 btf_id, member_idx; + bool has_refcounted_arg = false; + u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; - int err; + int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -22450,7 +22963,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + member_off = __btf_member_bit_offset(t, member) / 8; + err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); @@ -22472,17 +22986,32 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EACCES; } - /* btf_ctx_access() used this to provide argument type info */ - prog->aux->ctx_arg_info = - st_ops_desc->arg_info[member_idx].info; - prog->aux->ctx_arg_info_size = - st_ops_desc->arg_info[member_idx].cnt; + for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { + if (st_ops_desc->arg_info[member_idx].info->refcounted) { + has_refcounted_arg = true; + break; + } + 
} + + /* Tail call is not allowed for programs with refcounted arguments since we + * cannot guarantee that valid refcounted kptrs will be passed to the callee. + */ + for (i = 0; i < env->subprog_cnt; i++) { + if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { + verbose(env, "program with __ref argument cannot tail call\n"); + return -EINVAL; + } + } + + prog->aux->st_ops = st_ops; + prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; - return 0; + return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, + st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" @@ -22558,6 +23087,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22600,6 +23130,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22856,6 +23395,33 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. 
+ */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, kthread_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22932,9 +23498,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - return 0; + return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { @@ -22944,6 +23508,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -23036,6 +23605,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn 
*insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = 
src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. 
+ */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for s in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields change for + * any instruction.
+ */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23113,12 +23978,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), - sizeof(struct bpf_verifier_state_list *), + sizeof(struct list_head), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; + for (i = 0; i < state_htab_size(env); i++) + INIT_LIST_HEAD(&env->explored_states[i]); + 
INIT_LIST_HEAD(&env->free_list); + ret = check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -23153,6 +24022,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23291,6 +24164,7 @@ err_unlock: vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/kernel/cfi.c b/kernel/cfi.c index 19be79639542..422fa4f958ae 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -73,14 +73,11 @@ static bool is_module_cfi_trap(unsigned long addr) struct module *mod; bool found = false; - rcu_read_lock_sched_notrace(); - + guard(rcu)(); mod = __module_address(addr); if (mod) found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - rcu_read_unlock_sched_notrace(); - return found; } #else /* CONFIG_MODULES */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 11ea8d24ac72..fa24c032ed6f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -851,7 +851,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (kn->parent != new_parent) + if (rcu_access_pointer(kn->__parent) != new_parent) return -EIO; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f231fe3a0744..ac2db99941ca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -633,9 +633,22 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } +static struct cgroup *kn_priv(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + /* + * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. + * Therefore it is always safe to dereference this pointer outside of a + * RCU section. 
+ */ + parent = rcu_dereference_check(kn->__parent, + kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); + return parent->priv; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* @@ -1612,7 +1625,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); cgroup_unlock(); @@ -1644,7 +1657,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs @@ -2118,7 +2131,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | - KERNFS_ROOT_SUPPORT_USER_XATTR, + KERNFS_ROOT_SUPPORT_USER_XATTR | + KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -4115,7 +4129,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 20552f163930..8aafd050b754 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -73,7 +73,6 @@ CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_GENERIC_PTDUMP=y CONFIG_KASAN=y CONFIG_KASAN_GENERIC=y CONFIG_KASAN_INLINE=y diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index a620fb4b2116..aff7c0fdbefa 100644 --- a/kernel/crash_reserve.c +++ 
b/kernel/crash_reserve.c @@ -375,11 +375,10 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) return 0; } -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) { unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; bool fixed_base = false; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ce1bb2301c06..0b9495187fba 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -837,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret = 0; - - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(0); /* * Avoid entering the debugger if we were triggered due to an oops * but panic_timeout indicates the system should automatically @@ -858,15 +854,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - goto out; /* Ouch, double exception ! */ + return 0; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - goto out; + return 0; - ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); -out: - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(1); - return ret; + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } NOKPROBE_SYMBOL(kgdb_handle_exception); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a77f1c779c4..9b11b10b120c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -334,7 +334,7 @@ poll_again: *cp = '\0'; p_tmp = strrchr(buffer, ' '); p_tmp = (p_tmp ? 
p_tmp + 1 : buffer); - strscpy(tmpbuffer, p_tmp, sizeof(tmpbuffer)); + strscpy(tmpbuffer, p_tmp); *cp = tmp; len = strlen(tmpbuffer); @@ -452,7 +452,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt); kdb_printf("%s", kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5f4be507d79f..7a4d2d4689a5 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -25,7 +25,6 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> -#include <linux/atomic.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -105,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(NOENVVALUE, "Environment variable should have value"), KDBMSG(NOTIMP, "Command not implemented"), KDBMSG(ENVFULL, "Environment full"), - KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(KMALLOCFAILED, "Failed to allocate memory"), KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), #ifdef CONFIG_CPU_XSCALE KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), @@ -130,13 +129,9 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* - * Initial environment. This is all kept static and local to - * this file. We don't want to rely on the memory allocation - * mechanisms in the kernel, so we use a very limited allocate-only - * heap for new and altered environment variables. The entire - * environment is limited to a fixed number of entries (add more - * to __env[] if required) and a fixed amount of heap (add more to - * KDB_ENVBUFSIZE if required). + * Initial environment. This is all kept static and local to this file. 
+ * The entire environment is limited to a fixed number of entries + * (add more to __env[] if required) */ static char *__env[31] = { @@ -259,35 +254,6 @@ char *kdbgetenv(const char *match) } /* - * kdballocenv - This function is used to allocate bytes for - * environment entries. - * Parameters: - * bytes The number of bytes to allocate in the static buffer. - * Returns: - * A pointer to the allocated space in the buffer on success. - * NULL if bytes > size available in the envbuffer. - * Remarks: - * We use a static environment buffer (envbuffer) to hold the values - * of dynamically generated environment variables (see kdb_set). Buffer - * space once allocated is never free'd, so over time, the amount of space - * (currently 512 bytes) will be exhausted if env variables are changed - * frequently. - */ -static char *kdballocenv(size_t bytes) -{ -#define KDB_ENVBUFSIZE 512 - static char envbuffer[KDB_ENVBUFSIZE]; - static int envbufsize; - char *ep = NULL; - - if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { - ep = &envbuffer[envbufsize]; - envbufsize += bytes; - } - return ep; -} - -/* * kdbgetulenv - This function will return the value of an unsigned * long-valued environment variable. * Parameters: @@ -348,9 +314,9 @@ static int kdb_setenv(const char *var, const char *val) varlen = strlen(var); vallen = strlen(val); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; + ep = kmalloc(varlen + vallen + 2, GFP_KDB); + if (!ep) + return KDB_KMALLOCFAILED; sprintf(ep, "%s=%s", var, val); @@ -359,6 +325,7 @@ static int kdb_setenv(const char *var, const char *val) && ((strncmp(__env[i], var, varlen) == 0) && ((__env[i][varlen] == '\0') || (__env[i][varlen] == '=')))) { + kfree_const(__env[i]); __env[i] = ep; return 0; } @@ -2119,32 +2086,6 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ - -/* Make sure we balance enable/disable calls, must disable first. 
*/ -static atomic_t kdb_nmi_disabled; - -static int kdb_disable_nmi(int argc, const char *argv[]) -{ - if (atomic_read(&kdb_nmi_disabled)) - return 0; - atomic_set(&kdb_nmi_disabled, 1); - arch_kgdb_ops.enable_nmi(0); - return 0; -} - -static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) -{ - if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) - return -EINVAL; - arch_kgdb_ops.enable_nmi(1); - return 0; -} - -static const struct kernel_param_ops kdb_param_ops_enable_nmi = { - .set = kdb_param_enable_nmi, -}; -module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); - /* * kdb_cpu - This function implements the 'cpu' command. * cpu [<cpunum>] @@ -2836,20 +2777,10 @@ static kdbtab_t maintab[] = { }, }; -static kdbtab_t nmicmd = { - .name = "disable_nmi", - .func = kdb_disable_nmi, - .usage = "", - .help = "Disable NMI entry to KDB", - .flags = KDB_ENABLE_ALWAYS_SAFE, -}; - /* Initialize the kdb command table. */ static void __init kdb_inittab(void) { kdb_register_table(maintab, ARRAY_SIZE(maintab)); - if (arch_kgdb_ops.enable_nmi) - kdb_register_table(&nmicmd, 1); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70c84b9d7be3..615b4e6d22c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; + pte_t pte; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); @@ -192,6 +193,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (!page_vma_mapped_walk(&pvmw)) goto unlock; VM_BUG_ON_PAGE(addr != pvmw.address, old_page); + pte = ptep_get(pvmw.pte); + + /* + * Handle PFN swap PTES, such as device-exclusive ones, that actually + * map pages: simply trigger GUP again to fix it up. 
+ */ + if (unlikely(!pte_present(pte))) { + page_vma_mapped_walk_done(&pvmw); + goto unlock; + } if (new_page) { folio_get(new_folio); @@ -206,7 +217,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); + flush_cache_page(vma, addr, pte_pfn(pte)); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at(mm, addr, pvmw.pte, @@ -1692,7 +1703,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/kernel/exit.c b/kernel/exit.c index c2e6c7b7779f..1b51dc099f1e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -268,6 +268,9 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -756,12 +759,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; - /* - * Ignore thread-group leaders that exited before all - * subthreads did. 
- */ - if (!delay_group_leader(tsk)) - do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && @@ -774,6 +771,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } if (autoreap) { diff --git a/kernel/fork.c b/kernel/fork.c index a61a4407ebdf..c4b26cd8998b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -311,11 +311,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, - PAGE_KERNEL, - 0, node, __builtin_return_address(0)); + node, __builtin_return_address(0)); if (!stack) return -ENOMEM; @@ -436,35 +434,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = UINT_MAX; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,14 +443,46 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) 
return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -491,51 +492,28 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. 
- */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vm_area_init_from(orig, new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(new->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(new); + return new; } -void __vm_area_free(struct vm_area_struct *vma) +void vm_area_free(struct vm_area_struct *vma) { + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); - vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -826,6 +804,36 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ +#ifdef CONFIG_MM_ID +static DEFINE_IDA(mm_ida); + +static inline int mm_alloc_id(struct mm_struct *mm) +{ + int ret; + + ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); + if (ret < 0) + return ret; + mm->mm_id = ret; + return 0; +} + +static inline void mm_free_id(struct mm_struct *mm) +{ + const mm_id_t id = mm->mm_id; + + mm->mm_id = MM_ID_DUMMY; + if (id == MM_ID_DUMMY) + return; + if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) + return; + ida_free(&mm_ida, id); +} +#else 
/* !CONFIG_MM_ID */ +static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } +static inline void mm_free_id(struct mm_struct *mm) {} +#endif /* CONFIG_MM_ID */ + static void check_mm(struct mm_struct *mm) { int i; @@ -929,6 +937,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); + mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); @@ -1263,6 +1272,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1304,6 +1322,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_pgd(mm)) goto fail_nopgd; + if (mm_alloc_id(mm)) + goto fail_noid; + if (init_new_context(p, mm)) goto fail_nocontext; @@ -1323,6 +1344,8 @@ fail_pcpu: fail_cid: destroy_context(mm); fail_nocontext: + mm_free_id(mm); +fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); @@ -1559,6 +1582,17 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) +{ + if (mm == current->mm) + return true; + if (ptrace_may_access(task, mode)) + return true; + if ((mode & PTRACE_MODE_READ) && perfmon_capable()) + return true; + return false; +} + struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; @@ -1571,7 +1605,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); - } else if (mm != current->mm && !ptrace_may_access(task, mode)) { + } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ 
-3179,6 +3213,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3195,11 +3234,10 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 04efa7a6e69b..dc898ec93463 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -93,6 +93,43 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static void debug_show_blocker(struct task_struct *task) +{ + struct task_struct *g, *t; + unsigned long owner; + struct mutex *lock; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); + + lock = READ_ONCE(task->blocker_mutex); + if (!lock) + return; + + owner = mutex_get_owner(lock); + if (unlikely(!owner)) { + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + return; + } + + /* Ensure the owner information is correct. 
*/ + for_each_process_thread(g, t) { + if ((unsigned long)t == owner) { + pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", + task->comm, task->pid, t->comm, t->pid); + sched_show_task(t); + return; + } + } +} +#else +static inline void debug_show_blocker(struct task_struct *task) +{ +} +#endif + static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -152,6 +189,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); + debug_show_blocker(t); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93a822d3c468..7cb19e601426 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -653,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; @@ -746,9 +745,9 @@ static int jump_label_add_module(struct module *mod) kfree(jlm); return -ENOMEM; } - preempt_disable(); - jlm2->mod = __module_address((unsigned long)key); - preempt_enable(); + scoped_guard(rcu) + jlm2->mod = __module_address((unsigned long)key); + jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); @@ -906,13 +905,13 @@ static void jump_label_update(struct static_key *key) return; } - preempt_disable(); - mod = __module_address((unsigned long)key); - if (mod) { - stop = mod->jump_entries + 
mod->num_jump_entries; - init = mod->state == MODULE_STATE_COMING; + scoped_guard(rcu) { + mod = __module_address((unsigned long)key); + if (mod) { + stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } } - preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c22ad51c4317..3e62b944c883 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -210,6 +210,16 @@ int sanity_check_segment_list(struct kimage *image) } #endif + /* + * The destination addresses are searched from system RAM rather than + * being allocated from the buddy allocator, so they are not guaranteed + * to be accepted by the current kernel. Accept the destination + * addresses before kexec swaps their content with the segments' source + * pages to avoid accessing memory before it is accepted. + */ + for (i = 0; i < nr_segments; i++) + accept_memory(image->segment[i].mem, image->segment[i].memsz); + return 0; } diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index d3689632e8b9..3a5c25b2adc9 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_buf *kbuf, unsigned long *lowest_load_addr) { - unsigned long lowest_addr = UINT_MAX; + unsigned long lowest_addr = ULONG_MAX; int ret; size_t i; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3eedb8c226ad..fba686487e3b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -464,6 +464,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -498,6 +504,12 @@ static int locate_mem_hole_bottom_up(unsigned 
long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 88aeac84e4c0..ffe0c3d52306 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { - guard(preempt)(); + guard(rcu)(); *probed_mod = __module_text_address((unsigned long) p->addr); if (!(*probed_mod)) return -EINVAL; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 4a0fb7978d0d..0e73fac55f8e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -59,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj) if (!klp_is_module(obj)) return; - rcu_read_lock_sched(); + guard(rcu)(); /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. 
The patches are removed by @@ -75,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj) */ if (mod && mod->klp_alive) obj->mod = mod; - - rcu_read_unlock_sched(); } static bool klp_initialized(void) diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 9ef9850aeebe..4e36258cc34f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ /* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* * Locking events for rwsem */ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b15757e63626..58d78a33ac65 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6264,6 +6264,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index cc33470f4de9..ce0362f0a871 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = { .name = "raw_spin_lock_irq" }; +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + 
+static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -1168,6 +1222,9 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -15,12 +15,6 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* * Using smp_cond_load_acquire() provides the acquire semantics @@ -30,9 +24,7 @@ struct mcs_spinlock { * 
spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 19b636f60a24..555e2b3a665a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -72,6 +72,14 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* * Returns: __mutex_owner(lock) on failure or NULL on success. */ @@ -182,6 +190,9 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, lock); +#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -197,6 +208,9 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, NULL); +#endif } /* diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..d6964fc29f51 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7d96bed718e4..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -25,8 +25,9 @@ #include 
<trace/events/lock.h> /* - * Include queued spinlock statistics code + * Include queued spinlock definitions and statistics code */ +#include "qspinlock.h" #include "qspinlock_stat.h" /* @@ -67,36 +68,6 @@ */ #include "mcs_spinlock.h" -#define MAX_NODES 4 - -/* - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in - * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. To accommodate - * that, we insert two more long words to pad it up to 32 bytes. IOW, only - * two of them can fit in a cacheline in this case. That is OK as it is rare - * to have more than 2 levels of slowpath nesting in actual use. We don't - * want to penalize pvqspinlocks to optimize for a rare case in native - * qspinlocks. - */ -struct qnode { - struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS - long reserved[2]; -#endif -}; - -/* - * The pending bit spinning loop count. - * This heuristic is used to limit the number of lockword accesses - * made by atomic_cond_read_relaxed when waiting for the lock to - * transition out of the "== _Q_PENDING_VAL" state. We don't spin - * indefinitely because there's no guarantee that we'll make forward - * progress. - */ -#ifndef _Q_PENDING_LOOPS -#define _Q_PENDING_LOOPS 1 -#endif /* * Per-CPU queue node structures; we can never have more than 4 nested @@ -106,161 +77,7 @@ struct qnode { * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. 
- */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&qnodes[idx].mcs, cpu); -} - -static inline __pure -struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) -{ - return &((struct qnode *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail), which heads an address dependency - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - /* - * We can use relaxed semantics since the caller ensures that the - * MCS node is properly initialized before updating the tail. - */ - return (u32)xchg_relaxed(&lock->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending - clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. 
- */ - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending - * @lock : Pointer to queued spinlock structure - * Return: The previous lock value - * - * *,*,* -> *,1,* - */ -#ifndef queued_fetch_set_pending_acquire -static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) -{ - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); -} -#endif - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -410,7 +227,7 @@ pv_queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES)) { lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); @@ -465,7 +282,7 @@ pv_queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); + prev = decode_tail(old, qnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. 
+ */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. 
+ */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. 
+ */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. 
+ */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/module/internal.h b/kernel/module/internal.h index d09b46ef032f..626cf8668a7e 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -124,17 +124,6 @@ char *module_next_tag_pair(char *string, unsigned long *secsize); #define for_each_modinfo_entry(entry, info, name) \ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) -static inline void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bf65e0c3c86f..00a60796327c 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -177,19 +177,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) unsigned long strtab_size; void *data_base = mod->mem[MOD_DATA].base; void *init_data_base = mod->mem[MOD_INIT_DATA].base; + struct mod_kallsyms *kallsyms; - /* Set up to point into init 
section. */ - mod->kallsyms = (void __rcu *)init_data_base + - info->mod_kallsyms_init_off; + kallsyms = init_data_base + info->mod_kallsyms_init_off; - rcu_read_lock(); - /* The following is safe since this pointer cannot change */ - rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + kallsyms->symtab = (void *)symsec->sh_addr; + kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - rcu_dereference(mod->kallsyms)->strtab = - (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; + kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + kallsyms->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -199,20 +195,19 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = data_base + info->stroffs; mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; - src = rcu_dereference(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { - rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + src = kallsyms->symtab; + for (ndst = i = 0; i < kallsyms->num_symtab; i++) { + kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { ssize_t ret; mod->core_kallsyms.typetab[ndst] = - rcu_dereference(mod->kallsyms)->typetab[i]; + kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - ret = strscpy(s, - &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + ret = strscpy(s, &kallsyms->strtab[src[i].st_name], strtab_size); if (ret < 0) break; @@ -220,7 
+215,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info) strtab_size -= ret + 1; } } - rcu_read_unlock(); + + /* Set up to point into init section. */ + rcu_assign_pointer(mod->kallsyms, kallsyms); mod->core_kallsyms.num_symtab = ndst; } @@ -260,7 +257,7 @@ static const char *find_kallsyms_symbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); struct module_memory *mod_mem; /* At worse, next value is at end of module */ @@ -319,7 +316,7 @@ void * __weak dereference_module_function_descriptor(struct module *mod, /* * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. + * not to lock to avoid deadlock on oopses, RCU is enough. */ int module_address_lookup(unsigned long addr, unsigned long *size, @@ -332,7 +329,7 @@ int module_address_lookup(unsigned long addr, int ret = 0; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (mod) { if (modname) @@ -350,8 +347,6 @@ int module_address_lookup(unsigned long addr, if (sym) ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } - preempt_enable(); - return ret; } @@ -359,7 +354,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -371,12 +366,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) goto out; strscpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); return 0; } } out: - preempt_enable(); return -ERANGE; } @@ -385,13 +378,13 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { 
struct mod_kallsyms *kallsyms; if (mod->state == MODULE_STATE_UNFORMED) continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); + kallsyms = rcu_dereference(mod->kallsyms); if (symnum < kallsyms->num_symtab) { const Elf_Sym *sym = &kallsyms->symtab[symnum]; @@ -400,12 +393,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strscpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); - preempt_enable(); return 0; } symnum -= kallsyms->num_symtab; } - preempt_enable(); return -ERANGE; } @@ -413,7 +404,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; @@ -453,23 +444,15 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name) { - unsigned long ret; - /* Don't lock: we're in enough trouble already. 
*/ - preempt_disable(); - ret = __module_kallsyms_lookup_name(name); - preempt_enable(); - return ret; + guard(rcu)(); + return __module_kallsyms_lookup_name(name); } unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { - unsigned long ret; - - preempt_disable(); - ret = __find_kallsyms_symbol_value(mod, name); - preempt_enable(); - return ret; + guard(rcu)(); + return __find_kallsyms_symbol_value(mod, name); } int module_kallsyms_on_each_symbol(const char *modname, @@ -490,10 +473,8 @@ int module_kallsyms_on_each_symbol(const char *modname, if (modname && strcmp(modname, mod->name)) continue; - /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ - preempt_disable(); - kallsyms = rcu_dereference_sched(mod->kallsyms); - preempt_enable(); + kallsyms = rcu_dereference_check(mod->kallsyms, + lockdep_is_held(&module_mutex)); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; diff --git a/kernel/module/main.c b/kernel/module/main.c index a256cc919ad7..a2859dc3eea6 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -67,7 +67,7 @@ /* * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), + * 1) List of modules (also safely readable within RCU read section), * 2) module_use links, * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). @@ -331,7 +331,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* * Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. + * (optional) module which owns it. Needs RCU or module_mutex. 
*/ bool find_symbol(struct find_symbol_arg *fsa) { @@ -345,8 +345,6 @@ bool find_symbol(struct find_symbol_arg *fsa) struct module *mod; unsigned int i; - module_assert_mutex_or_preempt(); - for (i = 0; i < ARRAY_SIZE(arr); i++) if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) return true; @@ -374,16 +372,14 @@ bool find_symbol(struct find_symbol_arg *fsa) } /* - * Search for module by name: must hold module_mutex (or preempt disabled - * for read-only access). + * Search for module by name: must hold module_mutex (or RCU for read-only + * access). */ struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex_or_preempt(); - list_for_each_entry_rcu(mod, &modules, list, lockdep_is_held(&module_mutex)) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -454,8 +450,7 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) struct module *mod; unsigned int cpu; - preempt_disable(); - + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -472,13 +467,10 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) per_cpu_ptr(mod->percpu, get_boot_cpu_id()); } - preempt_enable(); return true; } } } - - preempt_enable(); return false; } @@ -795,8 +787,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); /* Store the name and taints of the last unloaded module for diagnostic purposes */ - strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); - strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); + strscpy(last_unloaded_module.name, mod->name); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -814,10 +806,9 @@ void __symbol_put(const char *symbol) .gplok = 
true, }; - preempt_disable(); + guard(rcu)(); BUG_ON(!find_symbol(&fsa)); module_put(fsa.owner); - preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -832,13 +823,12 @@ void symbol_put_addr(void *addr) /* * Even though we hold a reference on the module; we still need to - * disable preemption in order to safely traverse the data structure. + * RCU read section in order to safely traverse the data structure. */ - preempt_disable(); + guard(rcu)(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); - preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -1189,7 +1179,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, getname: /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); + strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); unlock: mutex_unlock(&module_mutex); return fsa.sym; @@ -1341,7 +1331,7 @@ static void free_module(struct module *mod) mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + /* Wait for RCU synchronizing before releasing mod->list and buglist. 
*/ synchronize_rcu(); if (try_add_tainted_module(mod)) pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", @@ -1364,21 +1354,18 @@ void *__symbol_get(const char *symbol) .warn = true, }; - preempt_disable(); - if (!find_symbol(&fsa)) - goto fail; - if (fsa.license != GPL_ONLY) { - pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", - symbol); - goto fail; + scoped_guard(rcu) { + if (!find_symbol(&fsa)) + return NULL; + if (fsa.license != GPL_ONLY) { + pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", + symbol); + return NULL; + } + if (strong_try_module_get(fsa.owner)) + return NULL; } - if (strong_try_module_get(fsa.owner)) - goto fail; - preempt_enable(); return (void *)kernel_symbol_value(fsa.sym); -fail: - preempt_enable(); - return NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -3013,7 +3000,7 @@ static noinline int do_init_module(struct module *mod) #endif /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, we + * walking this within an RCU read section. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. @@ -3680,28 +3667,23 @@ out: /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { - const struct exception_table_entry *e = NULL; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (!mod) - goto out; + return NULL; if (!mod->num_exentries) - goto out; - - e = search_extable(mod->extable, - mod->num_exentries, - addr); -out: - preempt_enable(); - + return NULL; /* - * Now, if we found one, we are running inside it now, hence - * we cannot unload the module, hence no refcnt needed. 
+ * The address passed here belongs to a module that is currently + * invoked (we are running inside it). Therefore its module::refcnt + * needs already be >0 to ensure that it is not removed at this stage. + * All other user need to invoke this function within a RCU read + * section. */ - return e; + return search_extable(mod->extable, mod->num_exentries, addr); } /** @@ -3713,20 +3695,15 @@ out: */ bool is_module_address(unsigned long addr) { - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; + guard(rcu)(); + return __module_address(addr) != NULL; } /** * __module_address() - get the module which contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. */ struct module *__module_address(unsigned long addr) @@ -3744,8 +3721,6 @@ struct module *__module_address(unsigned long addr) return NULL; lookup: - module_assert_mutex_or_preempt(); - mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); @@ -3765,20 +3740,28 @@ lookup: */ bool is_module_text_address(unsigned long addr) { - bool ret; + guard(rcu)(); + return __module_text_address(addr) != NULL; +} - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ + struct module *mod; - return ret; + guard(rcu)(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (func(mod, data)) + break; + } } /** * __module_text_address() - get the module whose code contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. 
*/ struct module *__module_text_address(unsigned long addr) @@ -3801,7 +3784,7 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -3809,7 +3792,6 @@ void print_modules(void) } print_unloaded_tainted_modules(); - preempt_enable(); if (last_unloaded_module.name[0]) pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, last_unloaded_module.taints); diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 16742d1c630c..4fefec5b683c 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -21,8 +21,6 @@ int try_add_tainted_module(struct module *mod) { struct mod_unload_taint *mod_taint; - module_assert_mutex_or_preempt(); - if (!mod->taints) goto out; diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 277197977d43..d3204c5c74eb 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -12,11 +12,11 @@ /* * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. + * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. + * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * really hit __module_address() hard by doing a lot of stack unwinding; + * potentially from NMI context. 
*/ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) diff --git a/kernel/module/version.c b/kernel/module/version.c index 3718a8868321..2beefeba82d9 100644 --- a/kernel/module/version.c +++ b/kernel/module/version.c @@ -79,17 +79,17 @@ int check_modstruct_version(const struct load_info *info, .name = "module_layout", .gplok = true, }; + bool have_symbol; /* * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. + * locking is necessary. Regardless use a RCU read section to keep + * lockdep happy. */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); + scoped_guard(rcu) + have_symbol = find_symbol(&fsa); + BUG_ON(!have_symbol); + return check_version(info, "module_layout", mod, fsa.crc); } diff --git a/kernel/panic.c b/kernel/panic.c index 0c55eec9e874..a3889f38153d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -833,9 +833,15 @@ device_initcall(register_warn_debugfs); */ __visible noinstr void __stack_chk_fail(void) { + unsigned long flags; + instrumentation_begin(); + flags = user_access_save(); + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + + user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/params.c b/kernel/params.c index 0074d29c9b80..2509f216c9f3 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -551,7 +551,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[]; + struct param_attribute attrs[] __counted_by(num); }; #ifdef CONFIG_SYSFS @@ -651,35 +651,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, } /* Enlarge allocations. 
*/ - new_mp = krealloc(mk->mp, - sizeof(*mk->mp) + - sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; + mk->mp->num++; /* Extra pointer for NULL terminator */ - new_attrs = krealloc(mk->mp->grp.attrs, - sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), - GFP_KERNEL); + new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1, + sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); - sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); - mk->mp->attrs[mk->mp->num].param = kp; - mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0])); + sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr); + mk->mp->attrs[mk->mp->num - 1].param = kp; + mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. 
*/ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) - mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store; else - mk->mp->attrs[mk->mp->num].mattr.store = NULL; - mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; - mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; - mk->mp->num++; + mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index aa42de4d2768..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -68,6 +68,8 @@ config TREE_SRCU config FORCE_NEED_SRCU_NMI_SAFE bool "Force selection of NEED_SRCU_NMI_SAFE" depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select NEED_SRCU_NMI_SAFE default n help diff --git a/kernel/reboot.c b/kernel/reboot.c index 41ab9e1ba357..ec087827c85c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -36,6 +36,8 @@ enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; EXPORT_SYMBOL_GPL(reboot_mode); enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; +static enum hw_protection_action hw_protection_action = HWPROT_ACT_SHUTDOWN; + /* * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't @@ -229,6 +231,9 @@ EXPORT_SYMBOL(unregister_restart_handler); /** * do_kernel_restart - Execute kernel restart handler call chain * + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * * Calls functions registered with register_restart_handler. 
* * Expected to be called from machine_restart as last step of the restart @@ -933,61 +938,86 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +static const char *hw_protection_action_str(enum hw_protection_action action) +{ + switch (action) { + case HWPROT_ACT_SHUTDOWN: + return "shutdown"; + case HWPROT_ACT_REBOOT: + return "reboot"; + default: + return "undefined"; + } +} + +static enum hw_protection_action hw_failure_emergency_action; + /** - * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay - * @work: work_struct associated with the emergency poweroff function + * hw_failure_emergency_action_func - emergency action work after a known delay + * @work: work_struct associated with the emergency action function * * This function is called in very critical situations to force - * a kernel poweroff after a configurable timeout value. + * a kernel poweroff or reboot after a configurable timeout value. */ -static void hw_failure_emergency_poweroff_func(struct work_struct *work) +static void hw_failure_emergency_action_func(struct work_struct *work) { + const char *action_str = hw_protection_action_str(hw_failure_emergency_action); + + pr_emerg("Hardware protection timed-out. Trying forced %s\n", + action_str); + /* - * We have reached here after the emergency shutdown waiting period has - * expired. This means orderly_poweroff has not been able to shut off - * the system for some reason. + * We have reached here after the emergency action waiting period has + * expired. This means orderly_poweroff/reboot has not been able to + * shut off the system for some reason. * - * Try to shut down the system immediately using kernel_power_off - * if populated + * Try to shut off the system immediately if possible */ - pr_emerg("Hardware protection timed-out. 
Trying forced poweroff\n"); - kernel_power_off(); + + if (hw_failure_emergency_action == HWPROT_ACT_REBOOT) + kernel_restart(NULL); + else + kernel_power_off(); /* * Worst of the worst case trigger emergency restart */ - pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + pr_emerg("Hardware protection %s failed. Trying emergency restart\n", + action_str); emergency_restart(); } -static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, - hw_failure_emergency_poweroff_func); +static DECLARE_DELAYED_WORK(hw_failure_emergency_action_work, + hw_failure_emergency_action_func); /** - * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * hw_failure_emergency_schedule - Schedule an emergency system shutdown or reboot + * + * @action: The hardware protection action to be taken + * @action_delay_ms: Time in milliseconds to elapse before triggering action * * This may be called from any critical situation to trigger a system shutdown - * after a given period of time. If time is negative this is not scheduled. + * or reboot after a given period of time. + * If time is negative this is not scheduled. */ -static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +static void hw_failure_emergency_schedule(enum hw_protection_action action, + int action_delay_ms) { - if (poweroff_delay_ms <= 0) + if (action_delay_ms <= 0) return; - schedule_delayed_work(&hw_failure_emergency_poweroff_work, - msecs_to_jiffies(poweroff_delay_ms)); + hw_failure_emergency_action = action; + schedule_delayed_work(&hw_failure_emergency_action_work, + msecs_to_jiffies(action_delay_ms)); } /** - * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot + * __hw_protection_trigger - Trigger an emergency system shutdown or reboot * * @reason: Reason of emergency shutdown or reboot to be printed. * @ms_until_forced: Time to wait for orderly shutdown or reboot before * triggering it. 
Negative value disables the forced * shutdown or reboot. - * @shutdown: If true, indicates that a shutdown will happen - * after the critical tempeature is reached. - * If false, indicates that a reboot will happen - * after the critical tempeature is reached. + * @action: The hardware protection action to be taken. * * Initiate an emergency system shutdown or reboot in order to protect * hardware from further damage. Usage examples include a thermal protection. @@ -995,11 +1025,16 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) * pending even if the previous request has given a large timeout for forced * shutdown/reboot. */ -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action) { static atomic_t allow_proceed = ATOMIC_INIT(1); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + if (action == HWPROT_ACT_DEFAULT) + action = hw_protection_action; + + pr_emerg("HARDWARE PROTECTION %s (%s)\n", + hw_protection_action_str(action), reason); /* Shutdown should be initiated only once. 
*/ if (!atomic_dec_and_test(&allow_proceed)) @@ -1009,13 +1044,55 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut * Queue a backup emergency shutdown in the event of * orderly_poweroff failure */ - hw_failure_emergency_poweroff(ms_until_forced); - if (shutdown) + hw_failure_emergency_schedule(action, ms_until_forced); + if (action == HWPROT_ACT_REBOOT) + orderly_reboot(); + else orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(__hw_protection_trigger); + +static bool hw_protection_action_parse(const char *str, + enum hw_protection_action *action) +{ + if (sysfs_streq(str, "shutdown")) + *action = HWPROT_ACT_SHUTDOWN; + else if (sysfs_streq(str, "reboot")) + *action = HWPROT_ACT_REBOOT; else - orderly_reboot(); + return false; + + return true; +} + +static int __init hw_protection_setup(char *str) +{ + hw_protection_action_parse(str, &hw_protection_action); + return 1; +} +__setup("hw_protection=", hw_protection_setup); + +#ifdef CONFIG_SYSFS +static ssize_t hw_protection_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + hw_protection_action_str(hw_protection_action)); +} +static ssize_t hw_protection_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hw_protection_action_parse(buf, &hw_protection_action)) + return -EINVAL; + + return count; } -EXPORT_SYMBOL_GPL(__hw_protection_shutdown); +static struct kobj_attribute hw_protection_attr = __ATTR_RW(hw_protection); +#endif static int __init reboot_setup(char *str) { @@ -1276,6 +1353,7 @@ static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); #endif static struct attribute *reboot_attrs[] = { + &hw_protection_attr.attr, &reboot_mode_attr.attr, #ifdef CONFIG_X86 &reboot_force_attr.attr, diff --git a/kernel/relay.c b/kernel/relay.c index a8ae436dc77e..5ac7e711e4b6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -351,10 
+351,9 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, struct dentry *dentry; char *tmpname; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + tmpname = kasprintf(GFP_KERNEL, "%s%d", chan->base_filename, cpu); if (!tmpname) return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); /* Create file in fs */ dentry = chan->cb->create_buf_file(tmpname, chan->parent, diff --git a/kernel/resource.c b/kernel/resource.c index 12004452d999..8d3e6ed0bdc1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -561,8 +561,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, struct resource res, o; bool covered; - res.start = start; - res.end = start + size - 1; + res = DEFINE_RES(start, size, 0); for (p = parent->child; p ; p = p->sibling) { if (!resource_intersection(p, &res, &o)) @@ -1714,18 +1713,13 @@ static int __init reserve_setup(char *str) * I/O port space; otherwise assume it's memory. */ if (io_start < 0x10000) { - res->flags = IORESOURCE_IO; + *res = DEFINE_RES_IO_NAMED(io_start, io_num, "reserved"); parent = &ioport_resource; } else { - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_MEM_NAMED(io_start, io_num, "reserved"); parent = &iomem_resource; } - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; res->flags |= IORESOURCE_BUSY; - res->desc = IORES_DESC_NONE; - res->child = NULL; if (request_resource(parent, res) == 0) reserved = x+1; } @@ -1975,11 +1969,7 @@ get_free_mem_region(struct device *dev, struct resource *base, */ revoke_iomem(res); } else { - res->start = addr; - res->end = addr + size - 1; - res->name = name; - res->desc = desc; - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_NAMED_DESC(addr, size, name, IORESOURCE_MEM, desc); /* * Only succeed if the resource hosts an exclusive diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 21575d39c376..66bcd40a28ca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4171,8 +4171,8 @@ 
static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) init_dsq(dsq, dsq_id); - ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); + ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); if (ret) { kfree(dsq); return ERR_PTR(ret); @@ -5361,6 +5361,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); + scx_idle_enable(ops); + if (scx_ops.init) { ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { @@ -5427,8 +5429,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - scx_idle_enable(ops); - /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 52c36a70a3d0..cb343ca889e0 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -544,7 +544,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * core. */ if (flags & SCX_PICK_IDLE_CORE) { - cpu = prev_cpu; + cpu = -EBUSY; goto out_unlock; } } @@ -584,8 +584,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * increasing distance. 
*/ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); - if (cpu >= 0) - goto out_unlock; out_unlock: rcu_read_unlock(); @@ -723,14 +721,14 @@ static void reset_idle_masks(struct sched_ext_ops *ops) void scx_idle_enable(struct sched_ext_ops *ops) { if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) - static_branch_enable(&scx_builtin_idle_enabled); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); else - static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) - static_branch_enable(&scx_builtin_idle_per_node); + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); else - static_branch_disable(&scx_builtin_idle_per_node); + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); #ifdef CONFIG_SMP reset_idle_masks(ops); diff --git a/kernel/signal.c b/kernel/signal.c index 86ba66d95da5..f8859faa26c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -176,9 +176,10 @@ static bool recalc_sigpending_tsk(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - + if (!recalc_sigpending_tsk(current) && !freezing(current)) { + if (unlikely(test_thread_flag(TIF_SIGPENDING))) + clear_thread_flag(TIF_SIGPENDING); + } } EXPORT_SYMBOL(recalc_sigpending); @@ -2179,11 +2180,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - /* - * Notify for thread-group leaders without subthreads. 
- */ - if (thread_group_empty(tsk)) - do_notify_pidfd(tsk); + + /* ptraced, or group-leader without sub-threads */ + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 4dae6ac2e83f..513b1945987c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? 
*/ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index a297790b7333..269683d41aa9 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -325,13 +325,12 @@ static int __static_call_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 033fba0633cf..a3f35c7d83b6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -265,8 +265,7 @@ config FUNCTION_GRAPH_RETADDR config FUNCTION_TRACE_ARGS bool - depends on HAVE_FUNCTION_ARG_ACCESS_API - depends on DEBUG_INFO_BTF + depends on PROBE_EVENTS_BTF_ARGS default y help If supported with function argument access API and BTF, then diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 997fb2a47c92..187dc37d61d4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -392,7 +392,7 @@ static const struct bpf_func_proto 
bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -405,10 +405,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -451,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -606,6 +607,11 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; +const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) +{ + return &bpf_perf_event_read_value_proto; +} + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, @@ -843,7 +849,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (!preemptible()) { + if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. 
*/ @@ -2332,10 +2338,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); - preempt_enable(); } static __always_inline @@ -2919,18 +2924,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { + bool skip_add = false; struct module *mod; - preempt_disable(); - mod = __module_address(addrs[i]); - /* Either no module or we it's already stored */ - if (!mod || has_module(&arr, mod)) { - preempt_enable(); - continue; + scoped_guard(rcu) { + mod = __module_address(addrs[i]); + /* Either no module or it's already stored */ + if (!mod || has_module(&arr, mod)) { + skip_add = true; + break; /* scoped_guard */ + } + if (!try_module_get(mod)) + err = -EINVAL; } - if (!try_module_get(mod)) - err = -EINVAL; - preempt_enable(); + if (skip_add) + continue; if (err) break; err = add_module(&arr, mod); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92015de6203d..1a48aedb5255 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6855,6 +6855,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? 
-EINVAL : 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9d4d951090d3..c0f877d39a24 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -31,6 +31,7 @@ #include <asm/local64.h> #include <asm/local.h> +#include <asm/setup.h> #include "trace.h" @@ -48,9 +49,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; - unsigned long text_addr; - unsigned long data_addr; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -517,7 +521,7 @@ struct ring_buffer_per_cpu { struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -550,8 +554,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; - long last_text_delta; - long last_data_delta; + struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; @@ -1271,7 +1274,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1569,7 +1572,7 @@ out_locked: static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1580,19 +1583,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void 
*rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1624,7 +1630,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1640,7 +1646,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1666,14 +1672,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) } /* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. 
+ */ +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + /* Zero out the scatch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + + return false; +} + +/* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. 
*/ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages, - unsigned long *subbuf_mask) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1684,20 +1753,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, if (!subbuf_mask) return false; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); - return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } - buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1743,7 +1798,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1810,7 +1865,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; @@ -1891,24 +1946,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu 
*cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) -{ - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; -} - -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; @@ -1916,20 +1960,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + if (rb_meta_init(buffer, scratch_size)) + valid = true; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1940,16 +1985,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The 
buffers[] array holds the order of the sub-buffers @@ -1971,7 +2012,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1996,7 +2037,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2045,7 +2086,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2060,7 +2101,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -2184,7 +2225,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -2313,6 +2354,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) static struct trace_buffer *alloc_buffer(unsigned long size, unsigned 
flags, int order, unsigned long start, unsigned long end, + unsigned long scratch_size, struct lock_class_key *key) { struct trace_buffer *buffer; @@ -2355,10 +2397,23 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + + /* Calculate the size for the per CPU data */ + size = end - buffers_start; size = size / nr_cpu_ids; /* @@ -2368,7 +2423,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. 
*/ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2376,8 +2431,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -2394,7 +2449,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); } else { /* need at least two pages */ @@ -2447,7 +2502,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2459,6 +2514,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. 
* * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2469,32 +2525,29 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key); } -/** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero - */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) { - if (!buffer) - return false; + struct ring_buffer_meta *meta; + void *ptr; - if (!buffer->last_text_delta) - return false; + if (!buffer || !buffer->meta) + return NULL; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + meta = buffer->meta; - return true; + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -3105,7 +3158,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3117,7 +3170,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = 
cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3134,7 +3187,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3763,7 +3816,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -5963,7 +6016,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -6016,7 +6069,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6050,7 +6103,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6069,11 +6121,6 @@ void ring_buffer_reset_cpu(struct trace_buffer 
*buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6088,7 +6135,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6116,11 +6162,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -7278,7 +7319,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); @@ -7411,9 +7453,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. 
*/ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 50344aa9f7f9..968c5c3b0246 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -809,7 +809,8 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) if (p && rv_is_nested_monitor(p)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); - return -EINVAL; + retval = -EINVAL; + goto out_unlock; } r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 826267f5b650..b581e388a9d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@ #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -3340,10 +3342,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; @@ -3393,7 +3394,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3423,7 +3423,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) 
{ @@ -3468,7 +3467,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3484,7 +3482,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); @@ -4206,7 +4203,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) * safe to use if the array has delta offsets * Force printing via the fields. */ - if ((tr->text_delta || tr->data_delta) && + if ((tr->text_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); @@ -6001,11 +5998,130 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return __tracing_resize_ring_buffer(tr, size, cpu_id); } +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; + +struct trace_scratch { + unsigned long text_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; +}; + +static DEFINE_MUTEX(scratch_mutex); + +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. 
+ */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscrach, module_delta must be NULL. */ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || tscratch->entries[0].mod_addr > addr) + return addr + tr->text_delta; + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + +#ifdef CONFIG_MODULES +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif + static void update_last_data(struct trace_array *tr) { - if (!tr->text_delta && !tr->data_delta) + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) + return; + + if (!(tr->flags & 
TRACE_ARRAY_FL_LAST_BOOT)) return; + /* Only if the buffer has previous boot data clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + guard(mutex)(&scratch_mutex); + module_for_each_mod(save_mod, tr); + } + /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -6016,7 +6132,17 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; - tr->data_delta = 0; + + if (!tr->scratch) + return; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); + + /* Set the persistent ring buffer meta data to this address */ + tscratch->text_addr = (unsigned long)_text; } /** @@ -6825,19 +6951,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_array *tr = filp->private_data; - struct seq_buf seq; - char buf[64]; + struct trace_array *tr = m->private; + struct trace_scratch *tscratch = tr->scratch; + unsigned int index = *pos; + + (*pos)++; + + if (*pos == 1) + return LAST_BOOT_HEADER; + + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; + + /* *pos 0 is for the header, 1 is for the first module */ + index--; + + if (index >= tscratch->nr_entries) + return NULL; + + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + 
mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; + + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); + else + seq_puts(m, "# Current\n"); +} + +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; - seq_buf_init(&seq, buf, 64); + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; - seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); - seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } - return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); + m = file->private_data; + m->private = tr; + + return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) @@ -7466,10 +7675,10 @@ static const struct file_operations trace_time_stamp_mode_fops = { }; static const struct 
file_operations last_boot_fops = { - .open = tracing_open_generic_tr, - .read = tracing_last_boot_read, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, + .open = tracing_last_boot_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -8292,6 +8501,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* A memmap'ed buffer is not supported for user space mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) + return -ENODEV; + /* Currently the boot mapped buffer is not supported for mmap */ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) return -ENODEV; @@ -9209,22 +9422,125 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 
1 : -1; +} + +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + struct trace_module_delta *module_delta; + struct trace_mod_entry *entry; + int i, nr_entries; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. */ + module_for_each_mod(make_mod_delta, tr); + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); +} + static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size); + tr->range_addr_size, + struct_size(tscratch, entries, 128)); + + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + setup_trace_scratch(tr, tscratch, scratch_size); - ring_buffer_last_boot_delta(buf->buffer, - &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. @@ -9293,6 +9609,7 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); @@ -9458,6 +9775,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9488,29 +9806,27 @@ static int instance_mkdir(const char *name) return ret; } -static u64 map_pages(u64 start, u64 size) +static u64 map_pages(unsigned long start, unsigned long size) { - struct page **pages; - phys_addr_t page_start; - unsigned int page_count; - unsigned int i; - void *vaddr; - - page_count = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; - page_start = start; - pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); - if (!pages) + area = get_vm_area(size, VM_IOREMAP); + if (!area) return 0; - for (i = 0; i < page_count; i++) { - phys_addr_t addr = page_start + i * PAGE_SIZE; - pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; } - vaddr = 
vmap(pages, page_count, VM_MAP, PAGE_KERNEL); - kfree(pages); - return (u64)(unsigned long)vaddr; + return (u64)vmap_start; } /** @@ -9584,6 +9900,11 @@ static int __remove_instance(struct trace_array *tr) free_trace_buffers(tr); clear_tracing_err_log(tr); + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } @@ -9905,6 +10226,24 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static void trace_module_record(struct module *mod, bool add) +{ + struct trace_array *tr; + unsigned long flags; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); + /* Update any persistent trace array that has already been started */ + if (flags == TRACE_ARRAY_FL_BOOT && add) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -9913,9 +10252,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } @@ -10364,6 +10705,7 @@ static inline void do_allocate_snapshot(const char *name) { } __init static void enable_instances(void) { struct trace_array *tr; + bool memmap_area = false; char *curr_str; char *name; char *str; @@ -10381,6 +10723,7 @@ __init static void enable_instances(void) bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); @@ -10431,16 
+10774,31 @@ __init static void enable_instances(void) name); continue; } + memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { - addr = map_pages(start, size); + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); @@ -10467,15 +10825,18 @@ __init static void enable_instances(void) update_printk_trace(tr); /* - * If start is set, then this is a mapped buffer, and - * cannot be deleted by user space, so keep the reference - * to it. + * memmap'd buffers can not be freed. */ - if (start) { - tr->flags |= TRACE_ARRAY_FL_BOOT; + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; tr->ref++; } + if (start) { + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; + tr->range_name = no_free_ptr(rname); + } + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a6621e2a0fa..79be1995db44 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -313,6 +313,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. 
@@ -349,8 +354,13 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; - long data_delta; + struct trace_module_delta *module_delta; + void *scratch; /* pointer in persistent memory */ + int scratch_size; + + int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -368,7 +378,6 @@ struct trace_array { * CONFIG_TRACER_MAX_TRACE. */ arch_spinlock_t max_lock; - int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; @@ -434,9 +443,11 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = BIT(0), - TRACE_ARRAY_FL_BOOT = BIT(1), - TRACE_ARRAY_FL_MOD_INIT = BIT(2), + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_LAST_BOOT = BIT(2), + TRACE_ARRAY_FL_MOD_INIT = BIT(3), + TRACE_ARRAY_FL_MEMMAP = BIT(4), }; #ifdef CONFIG_MODULES @@ -463,6 +474,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. 
@@ -785,6 +798,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; @@ -838,13 +853,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8e7603acca21..069e92856bda 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -1820,28 +1827,28 @@ event_enable_write(struct file 
*filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? + * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are disabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1857,7 +1864,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? 
system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8287b175667f..2703b96d8990 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -124,9 +124,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - rcu_read_lock_sched(); - ret = !!find_module(tk->symbol); - rcu_read_unlock_sched(); + scoped_guard(rcu) + ret = !!find_module(tk->symbol); *p = ':'; return ret; @@ -796,12 +795,10 @@ static struct module *try_module_get_by_name(const char *name) { struct module *mod; - rcu_read_lock_sched(); + guard(rcu)(); mod = find_module(name); if (mod && !try_module_get(mod)) mod = NULL; - rcu_read_unlock_sched(); - return mod; } #else diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 72b699f909e8..fee40ffbd490 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * */ +#include "trace.h" #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> @@ -1340,7 +1341,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1357,7 +1357,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } diff --git a/kernel/ucount.c b/kernel/ucount.c index 86c5f1c0bad9..8686e329b8f2 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,11 +11,14 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = 
ATOMIC_INIT(1), + .count = RCUREF_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 -static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS) +static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = { + [0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0) +}; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ @@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) - #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -127,88 +129,73 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } -static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, + struct hlist_nulls_head *hashent) { struct ucounts *ucounts; + struct hlist_nulls_node *pos; - hlist_for_each_entry(ucounts, hashent, node) { - if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) - return ucounts; + guard(rcu)(); + hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { + if (rcuref_get(&ucounts->count)) + return ucounts; + } } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { - struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); - hlist_add_head(&ucounts->node, hashent); + hlist_nulls_add_head_rcu(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } -static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { - /* Returns true on a successful get, false if the count wraps. 
*/ - return !atomic_add_negative(1, &ucounts->count); -} + struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - if (!get_ucounts_or_wrap(ucounts)) { - put_ucounts(ucounts); - ucounts = NULL; - } - return ucounts; -} + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) + return ucounts; -struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) -{ - struct hlist_head *hashent = ucounts_hashentry(ns, uid); - bool wrapped; - struct ucounts *ucounts, *new = NULL; + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + rcuref_init(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { + if (ucounts) { spin_unlock_irq(&ucounts_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->ns = ns; - new->uid = uid; - atomic_set(&new->count, 1); - - spin_lock_irq(&ucounts_lock); - ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { - hlist_add_head(&new->node, hashent); - get_user_ns(new->ns); - spin_unlock_irq(&ucounts_lock); - return new; - } + kfree(new); + return ucounts; } - wrapped = !get_ucounts_or_wrap(ucounts); + hlist_nulls_add_head_rcu(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); - kfree(new); - if (wrapped) { - put_ucounts(ucounts); - return NULL; - } - return ucounts; + return new; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { - hlist_del_init(&ucounts->node); + if (rcuref_put(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); - kfree(ucounts); + kfree_rcu(ucounts, rcu); } } diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 
a78ff092d636..75af12ff774e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -269,12 +269,10 @@ void __init hardlockup_config_perf_event(const char *str) } else { unsigned int len = comma - str; - if (len >= sizeof(buf)) + if (len > sizeof(buf)) return; - if (strscpy(buf, str, sizeof(buf)) < 0) - return; - buf[len] = 0; + strscpy(buf, str, len); if (kstrtoull(buf, 16, &config)) return; } |
