From 987d0242d189661f78b77cc4d77f843b15600fed Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Apr 2023 00:14:10 -0700 Subject: bpf: Add bpf_dynptr_adjust Add a new kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end); which adjusts the dynptr to reflect the new [start, end) interval. In particular, it advances the offset of the dynptr by "start" bytes, and if end is less than the size of the dynptr, then this will trim the dynptr accordingly. Adjusting the dynptr interval may be useful in certain situations. For example, when hashing which takes in generic dynptrs, if the dynptr points to a struct but only a certain memory region inside the struct should be hashed, adjust can be used to narrow in on the specific region to hash. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230420071414.570108-2-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8d368fa353f9..42e06d1b492d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1448,6 +1448,13 @@ u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) return ptr->size & DYNPTR_SIZE_MASK; } +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +{ + u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; + + ptr->size = new_size | metadata; +} + int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; @@ -2297,6 +2304,24 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); } +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) +{ + u32 size; + + if (!ptr->data || start > end) + return -EINVAL; + + size = bpf_dynptr_get_size(ptr); + + if (start > size || end > size) + return -ERANGE; + + ptr->offset += start; + bpf_dynptr_set_size(ptr, end - start); + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2369,6 +2394,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 540ccf96ddbc173474c32e595787d5622253be3d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Apr 2023 00:14:11 -0700 Subject: bpf: Add bpf_dynptr_is_null and bpf_dynptr_is_rdonly bpf_dynptr_is_null returns true if the dynptr is null / invalid (determined by whether ptr->data is NULL), else false if the dynptr is a valid dynptr. bpf_dynptr_is_rdonly returns true if the dynptr is read-only, else false if the dynptr is read-writable. If the dynptr is null / invalid, false is returned by default. 
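For illustration, below is a minimal BPF-side sketch that combines these
two kfuncs with bpf_dynptr_adjust() from the previous patch. The extern
declarations mirror the kernel prototypes above; the ringbuf map, the
tracepoint section and the rest of the program scaffolding are
illustrative assumptions, not part of this patch:

  /* Hedged sketch: validate a dynptr, narrow its view, then write.
   * Externs follow the kfunc prototypes added in this series; map and
   * section names are hypothetical.
   */
  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym;
  extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym;
  extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym;

  struct {
          __uint(type, BPF_MAP_TYPE_RINGBUF);
          __uint(max_entries, 4096);
  } ringbuf SEC(".maps");

  SEC("tp/syscalls/sys_enter_nanosleep")
  int dynptr_query_example(void *ctx)
  {
          struct bpf_dynptr ptr;
          __u64 val = 42;

          /* on reservation failure the dynptr is set to null */
          bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr);
          if (bpf_dynptr_is_null(&ptr))
                  goto out;

          /* narrow the view to bytes [8, 24) of the reservation */
          if (bpf_dynptr_adjust(&ptr, 8, 24))
                  goto out;

          /* ringbuf dynptrs are read-write, so this branch is taken */
          if (!bpf_dynptr_is_rdonly(&ptr))
                  bpf_dynptr_write(&ptr, 0, &val, sizeof(val), 0);
  out:
          bpf_ringbuf_discard_dynptr(&ptr, 0);
          return 0;
  }

  char _license[] SEC("license") = "GPL";
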
Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230420071414.570108-3-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 42e06d1b492d..a683b3e71a28 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1423,7 +1423,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1570,7 +1570,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v enum bpf_dynptr_type type; int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1626,7 +1626,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 if (err) return 0; - if (bpf_dynptr_is_rdonly(ptr)) + if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); @@ -2276,7 +2276,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, void *buffer, u32 buffer__szk) { - if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) + if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. @@ -2322,6 +2322,19 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en return 0; } +__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr) +{ + return !ptr->data; +} + +__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + if (!ptr->data) + return false; + + return __bpf_dynptr_is_rdonly(ptr); +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2395,6 +2408,8 @@ BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) +BTF_ID_FLAGS(func, bpf_dynptr_is_null) +BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 26662d7347a058ca497792c4b22ac91cc415cbf6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Apr 2023 00:14:12 -0700 Subject: bpf: Add bpf_dynptr_size bpf_dynptr_size returns the number of usable bytes in a dynptr. 
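As a hedged usage sketch (the externs mirror kfuncs from this series,
including bpf_dynptr_from_skb() from the skb dynptr support; the tc
program scaffolding is an assumption for the example):

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym;
  extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
                                 struct bpf_dynptr *ptr__uninit) __ksym;

  SEC("tc")
  int bounded_read(struct __sk_buff *skb)
  {
          struct bpf_dynptr ptr;
          __u64 hdr;

          if (bpf_dynptr_from_skb(skb, 0, &ptr))
                  return 0;

          /* only read if at least sizeof(hdr) usable bytes remain */
          if (bpf_dynptr_size(&ptr) >= sizeof(hdr))
                  bpf_dynptr_read(&hdr, sizeof(hdr), &ptr, 0, 0);

          return 0;
  }

  char _license[] SEC("license") = "GPL";
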
Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230420071414.570108-4-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 15 ++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a683b3e71a28..c465e97733b9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1443,7 +1443,7 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) +u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1476,7 +1476,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { - u32 size = bpf_dynptr_get_size(ptr); + u32 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; @@ -2311,7 +2311,7 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en if (!ptr->data || start > end) return -EINVAL; - size = bpf_dynptr_get_size(ptr); + size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; @@ -2335,6 +2335,14 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) return __bpf_dynptr_is_rdonly(ptr); } +__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +{ + if (!ptr->data) + return -EINVAL; + + return __bpf_dynptr_size(ptr); +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2410,6 +2418,7 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) +BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bcf91bc7bf71..8deb22a99abe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1349,9 +1349,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, } return verify_pkcs7_signature(data_ptr->data, - bpf_dynptr_get_size(data_ptr), + __bpf_dynptr_size(data_ptr), sig_ptr->data, - bpf_dynptr_get_size(sig_ptr), + __bpf_dynptr_size(sig_ptr), trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); -- cgit v1.2.3 From 361f129f3cc185af6667aca0bec0be9a020a8abc Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Apr 2023 00:14:13 -0700 Subject: bpf: Add bpf_dynptr_clone The cloned dynptr will point to the same data as its parent dynptr, with the same type, offset, size and read-only properties. Any writes to a dynptr will be reflected across all instances (by 'instance', this means any dynptrs that point to the same underlying data). Please note that data slice and dynptr invalidations will affect all instances as well. For example, if bpf_dynptr_write() is called on an skb-type dynptr, all data slices of dynptr instances to that skb will be invalidated as well (eg data slices of any clones, parents, grandparents, ...). Another example is if a ringbuf dynptr is submitted, any instance of that dynptr will be invalidated. 
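The following hedged sketch illustrates the sharing and invalidation
semantics described above (kfunc externs mirror this series; the ringbuf
map and section scaffolding are illustrative assumptions):

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr,
                              struct bpf_dynptr *clone__uninit) __ksym;
  extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym;

  struct {
          __uint(type, BPF_MAP_TYPE_RINGBUF);
          __uint(max_entries, 4096);
  } ringbuf SEC(".maps");

  SEC("tp/syscalls/sys_enter_getpid")
  int dynptr_clone_example(void *ctx)
  {
          struct bpf_dynptr orig, view;

          bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &orig);

          /* the clone shares data with 'orig' but has its own view */
          if (!bpf_dynptr_clone(&orig, &view))
                  /* narrows only 'view'; 'orig' still covers all 64 bytes */
                  bpf_dynptr_adjust(&view, 16, 32);

          /* submitting the parent invalidates 'view' and any data slices */
          bpf_ringbuf_submit_dynptr(&orig, 0);
          return 0;
  }

  char _license[] SEC("license") = "GPL";
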
Changing the view of the dynptr (eg advancing the offset or trimming the size) will only affect that dynptr and not affect any other instances. One example use case where cloning may be helpful is for hashing or iterating through dynptr data. Cloning will allow the user to maintain the original view of the dynptr for future use, while also allowing views to smaller subsets of the data after the offset is advanced or the size is trimmed. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230420071414.570108-5-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 14 +++++++ kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 97 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c465e97733b9..bb6b4637ebf2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2343,6 +2343,19 @@ __bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) return __bpf_dynptr_size(ptr); } +__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr, + struct bpf_dynptr_kern *clone__uninit) +{ + if (!ptr->data) { + bpf_dynptr_set_null(clone__uninit); + return -EINVAL; + } + + *clone__uninit = *ptr; + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2419,6 +2432,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) +BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbcf5a4e2fcd..ff4a8ab99f08 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -309,6 +309,7 @@ struct bpf_kfunc_call_arg_meta { struct { enum bpf_dynptr_type type; u32 id; + u32 ref_obj_id; } initialized_dynptr; struct { u8 spi; @@ -847,11 +848,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx) + enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i, id, err; + int spi, i, err; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -887,7 +888,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ - id = acquire_reference_state(env, insn_idx); + int id; + + if (clone_ref_obj_id) + id = clone_ref_obj_id; + else + id = acquire_reference_state(env, insn_idx); + if (id < 0) return id; @@ -901,24 +908,15 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ return 0; } -static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *state = func(env, reg); - int spi, i; - - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; + int i; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; state->stack[spi - 1].slot_type[i] = STACK_INVALID; } - /* Invalidate any slices associated with this dynptr */ - if 
(dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) - WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); @@ -945,6 +943,50 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re */ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ref_obj_id, i; + + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + invalidate_dynptr(env, state, spi); + return 0; + } + + ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + + /* If the dynptr has a ref_obj_id, then we need to invalidate + * two things: + * + * 1) Any dynptrs with a matching ref_obj_id (clones) + * 2) Any slices derived from this dynptr. + */ + + /* Invalidate any slices associated with this dynptr */ + WARN_ON_ONCE(release_reference(env, ref_obj_id)); + + /* Invalidate any dynptr clones */ + for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) + continue; + + /* it should always be the case that if the ref obj id + * matches then the stack slot also belongs to a + * dynptr + */ + if (state->stack[i].slot_type[0] != STACK_DYNPTR) { + verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + return -EFAULT; + } + if (state->stack[i].spilled_ptr.dynptr.first_slot) + invalidate_dynptr(env, state, i); + } return 0; } @@ -6677,7 +6719,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * type, and declare it as 'const struct bpf_dynptr *' in their prototype. 
*/ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, - enum bpf_arg_type arg_type) + enum bpf_arg_type arg_type, int clone_ref_obj_id) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; int err; @@ -6721,7 +6763,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); } else /* MEM_RDONLY and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { @@ -7631,7 +7673,7 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type); + err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); if (err) return err; break; @@ -9595,6 +9637,7 @@ enum special_kfunc_type { KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, + KF_bpf_dynptr_clone, }; BTF_SET_START(special_kfunc_set) @@ -9614,6 +9657,7 @@ BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) +BTF_ID(func, bpf_dynptr_clone) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -9635,6 +9679,7 @@ BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) +BTF_ID(func, bpf_dynptr_clone) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -10330,6 +10375,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; + int clone_ref_obj_id = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -10343,12 +10389,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; - if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { dynptr_arg_type |= DYNPTR_TYPE_SKB; - else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && + (dynptr_arg_type & MEM_UNINIT)) { + enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; + + if (parent_type == BPF_DYNPTR_TYPE_INVALID) { + verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + return -EFAULT; + } + + dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); + clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; + if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { + verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + return -EFAULT; + } + } - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); + ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -10361,6 +10423,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } meta->initialized_dynptr.id = id; meta->initialized_dynptr.type = dynptr_get_type(env, reg); + meta->initialized_dynptr.ref_obj_id = 
dynptr_ref_obj_id(env, reg); } break; -- cgit v1.2.3 From fedf99200ab086c42a572fca1d7266b06cdc3e3f Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 2 May 2023 11:14:18 -0700 Subject: bpf: Print a warning only if writing to unprivileged_bpf_disabled. Only print the warning message if you are writing to "/proc/sys/kernel/unprivileged_bpf_disabled". The kernel may print an annoying warning when you read "/proc/sys/kernel/unprivileged_bpf_disabled" saying WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks! However, this message is only meaningful when the feature is disabled or enabled. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230502181418.308479-1-kuifeng@meta.com --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 14f39c1e573e..909c112ef537 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5380,7 +5380,8 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, *(int *)table->data = unpriv_enable; } - unpriv_ebpf_notify(unpriv_enable); + if (write) + unpriv_ebpf_notify(unpriv_enable); return ret; } -- cgit v1.2.3 From e0bf462276b6ee23203365eacb5c599f42a5a084 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:09 -0700 Subject: bpf: mark relevant stack slots scratched for register read instructions When handling instructions that read register slots, mark relevant stack slots as scratched so that verifier log would contain those slots' states, in addition to currently emitted registers with stack slot offsets. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff4a8ab99f08..da8a5834f2ca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4109,6 +4109,7 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, for (i = min_off; i < max_off; i++) { slot = -i - 1; spi = slot / BPF_REG_SIZE; + mark_stack_slot_scratched(env, spi); stype = ptr_state->stack[spi].slot_type; if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) break; @@ -4160,6 +4161,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; + mark_stack_slot_scratched(env, spi); + if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; -- cgit v1.2.3 From 407958a0e980b9e1842ab87b5a1040521e1e24e9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:10 -0700 Subject: bpf: encapsulate precision backtracking bookkeeping Add struct backtrack_state and straightforward API around it to keep track of register and stack masks used and maintained during precision backtracking process. Having this logic separately allow to keep high-level backtracking algorithm cleaner, but also it sets us up to cleanly keep track of register and stack masks per frame, allowing (with some further logic adjustments) to perform precision backpropagation across multiple frames (i.e., subprog calls). 
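For reference, the encapsulated state is essentially the following (the
struct itself is added to include/linux/bpf_verifier.h, which is outside
the kernel/-limited diff shown below, so treat this as a close paraphrase
rather than the authoritative definition):

  struct backtrack_state {
          struct bpf_verifier_env *env;
          /* frame we are currently backtracking in */
          u32 frame;
          /* per-frame masks of registers that need precision marks */
          u32 reg_masks[MAX_CALL_FRAMES];
          /* per-frame masks of stack slots, in BPF_REG_SIZE units */
          u64 stack_masks[MAX_CALL_FRAMES];
  };

The bt_set_reg()/bt_clear_reg() and bt_set_slot()/bt_clear_slot()
helpers below operate on the masks of bt->frame, while the *_frame_*
variants take an explicit frame index.
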
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 249 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 182 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da8a5834f2ca..9b2e571250e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1296,6 +1296,12 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack) return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; } +static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) +{ + return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL && + stack->spilled_ptr.type == SCALAR_VALUE; +} + static void scrub_spilled_slot(u8 *stype) { if (*stype != STACK_INVALID) @@ -3186,12 +3192,128 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static inline void bt_init(struct backtrack_state *bt, u32 frame) +{ + bt->frame = frame; +} + +static inline void bt_reset(struct backtrack_state *bt) +{ + struct bpf_verifier_env *env = bt->env; + + memset(bt, 0, sizeof(*bt)); + bt->env = env; +} + +static inline u32 bt_empty(struct backtrack_state *bt) +{ + u64 mask = 0; + int i; + + for (i = 0; i <= bt->frame; i++) + mask |= bt->reg_masks[i] | bt->stack_masks[i]; + + return mask == 0; +} + +static inline int bt_subprog_enter(struct backtrack_state *bt) +{ + if (bt->frame == MAX_CALL_FRAMES - 1) { + verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + bt->frame++; + return 0; +} + +static inline int bt_subprog_exit(struct backtrack_state *bt) +{ + if (bt->frame == 0) { + verbose(bt->env, "BUG subprog exit from frame 0\n"); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + bt->frame--; + return 0; +} + +static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] |= 1 << reg; +} + +static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] &= ~(1 << reg); +} + +static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) +{ + bt_set_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) +{ + bt_clear_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] |= 1ull << slot; +} + +static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] &= ~(1ull << slot); +} + +static inline void bt_set_slot(struct backtrack_state *bt, u32 slot) +{ + bt_set_frame_slot(bt, bt->frame, slot); +} + +static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot) +{ + bt_clear_frame_slot(bt, bt->frame, slot); +} + +static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->reg_masks[frame]; +} + +static inline u32 bt_reg_mask(struct backtrack_state *bt) +{ + return bt->reg_masks[bt->frame]; +} + +static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->stack_masks[frame]; +} + +static inline u64 bt_stack_mask(struct backtrack_state *bt) +{ + return bt->stack_masks[bt->frame]; +} + +static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) +{ + return bt->reg_masks[bt->frame] 
& (1 << reg); +} + +static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) +{ + return bt->stack_masks[bt->frame] & (1ull << slot); +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, - u32 *reg_mask, u64 *stack_mask) + struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -3202,20 +3324,20 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); u8 mode = BPF_MODE(insn->code); - u32 dreg = 1u << insn->dst_reg; - u32 sreg = 1u << insn->src_reg; + u32 dreg = insn->dst_reg; + u32 sreg = insn->src_reg; u32 spi; if (insn->code == 0) return 0; if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "regs=%x stack=%llx before ", bt_reg_mask(bt), bt_stack_mask(bt)); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (class == BPF_ALU || class == BPF_ALU64) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { @@ -3223,8 +3345,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * dreg needs precision after this insn * sreg needs precision before this insn */ - *reg_mask &= ~dreg; - *reg_mask |= sreg; + bt_clear_reg(bt, dreg); + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3232,7 +3354,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * as precise=true in this verifier state. * No further markings in parent are necessary */ - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); } } else { if (BPF_SRC(insn->code) == BPF_X) { @@ -3240,15 +3362,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * both dreg and sreg need precision * before this insn */ - *reg_mask |= sreg; + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ } } else if (class == BPF_LDX) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. @@ -3269,9 +3391,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - *stack_mask |= 1ull << spi; + bt_set_slot(bt, spi); } else if (class == BPF_STX || class == BPF_ST) { - if (*reg_mask & dreg) + if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction. 
@@ -3286,11 +3408,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - if (!(*stack_mask & (1ull << spi))) + if (!bt_is_slot_set(bt, spi)) return 0; - *stack_mask &= ~(1ull << spi); + bt_clear_slot(bt, spi); if (class == BPF_STX) - *reg_mask |= sreg; + bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) @@ -3307,19 +3429,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) return -ENOTSUPP; /* regular helper call sets R0 */ - *reg_mask &= ~1; - if (*reg_mask & 0x3f) { + bt_clear_reg(bt, BPF_REG_0); + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { /* if backtracing was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", *reg_mask); + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; } else if (BPF_SRC(insn->code) == BPF_X) { - if (!(*reg_mask & (dreg | sreg))) + if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) return 0; /* dreg sreg * Both dreg and sreg need precision before @@ -3327,7 +3449,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * before it would be equally necessary to * propagate it to dreg. */ - *reg_mask |= (sreg | dreg); + bt_set_reg(bt, dreg); + bt_set_reg(bt, sreg); /* else dreg K * Only dreg still needs precision before * this insn, so for the K-based conditional @@ -3335,9 +3458,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ } } else if (class == BPF_LD) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); /* It's ld_imm64 or ld_abs or ld_ind. * For ld_imm64 no further tracking of precision * into parent is necessary @@ -3550,20 +3673,21 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { + struct backtrack_state *bt = &env->bt; struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = regno >= 0 ? 1u << regno : 0; - u64 stack_mask = spi >= 0 ? 1ull << spi : 0; bool skip_first = true; - bool new_marks = false; int i, err; if (!env->bpf_capable) return 0; + /* set frame number from which we are starting to backtrack */ + bt_init(bt, frame); + /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision * tracking in the current state is unnecessary. 
@@ -3575,26 +3699,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - new_marks = true; + bt_set_reg(bt, regno); } while (spi >= 0) { - if (!is_spilled_reg(&func->stack[spi])) { - stack_mask = 0; + if (!is_spilled_scalar_reg(&func->stack[spi])) break; - } - reg = &func->stack[spi].spilled_ptr; - if (reg->type != SCALAR_VALUE) { - stack_mask = 0; - break; - } - new_marks = true; + bt_set_slot(bt, spi); break; } - if (!new_marks) - return 0; - if (!reg_mask && !stack_mask) + if (bt_empty(bt)) return 0; for (;;) { @@ -3613,12 +3728,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (st->curframe == 0 && st->frame[0]->subprogno > 0 && st->frame[0]->callsite == BPF_MAIN_FUNC && - stack_mask == 0 && (reg_mask & ~0x3e) == 0) { - bitmap_from_u64(mask, reg_mask); + bt_stack_mask(bt) == 0 && + (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { + bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; if (reg->type != SCALAR_VALUE) { - reg_mask &= ~(1u << i); + bt_clear_reg(bt, i); continue; } reg->precise = true; @@ -3626,8 +3742,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r return 0; } - verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, reg_mask, stack_mask); + verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } @@ -3637,15 +3753,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, ®_mask, &stack_mask); + err = backtrack_insn(env, i, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, st); + bt_reset(bt); return 0; } else if (err) { return err; } - if (!reg_mask && !stack_mask) + if (bt_empty(bt)) /* Found assignment(s) into tracked register in this state. * Since this state is already marked, just return. * Nothing to be tracked further in the parent state. @@ -3670,21 +3787,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (!st) break; - new_marks = false; func = st->frame[frame]; - bitmap_from_u64(mask, reg_mask); + bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; if (reg->type != SCALAR_VALUE) { - reg_mask &= ~(1u << i); + bt_clear_reg(bt, i); continue; } - if (!reg->precise) - new_marks = true; - reg->precise = true; + if (reg->precise) + bt_clear_reg(bt, i); + else + reg->precise = true; } - bitmap_from_u64(mask, stack_mask); + bitmap_from_u64(mask, bt_stack_mask(bt)); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { /* the sequence of instructions: @@ -3701,32 +3818,28 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r * In such case fallback to conservative. 
*/ mark_all_scalars_precise(env, st); + bt_reset(bt); return 0; } - if (!is_spilled_reg(&func->stack[i])) { - stack_mask &= ~(1ull << i); + if (!is_spilled_scalar_reg(&func->stack[i])) { + bt_clear_slot(bt, i); continue; } reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) { - stack_mask &= ~(1ull << i); - continue; - } - if (!reg->precise) - new_marks = true; - reg->precise = true; + if (reg->precise) + bt_clear_slot(bt, i); + else + reg->precise = true; } if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "parent %s regs=%x stack=%llx marks:", - new_marks ? "didn't have" : "already had", - reg_mask, stack_mask); + !bt_empty(bt) ? "didn't have" : "already had", + bt_reg_mask(bt), bt_stack_mask(bt)); print_verifier_state(env, func, true); } - if (!reg_mask && !stack_mask) - break; - if (!new_marks) + if (bt_empty(bt)) break; last_idx = st->last_insn_idx; @@ -18872,6 +18985,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!env) return -ENOMEM; + env->bt.env = env; + len = (*prog)->len; env->insn_aux_data = vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); -- cgit v1.2.3 From d9439c21a9e4769bfd83a03ab39056164d44ac31 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:11 -0700 Subject: bpf: improve precision backtrack logging Add helper to format register and stack masks in more human-readable format. Adjust logging a bit during backtrack propagation and especially during forcing precision fallback logic to make it clearer what's going on (with log_level=2, of course), and also start reporting affected frame depth. This is in preparation for having more than one active frame later when precision propagation between subprog calls is added. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b2e571250e1..5412c8c8511d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -605,9 +605,9 @@ static const char *reg_type_str(struct bpf_verifier_env *env, type & PTR_TRUSTED ? "trusted_" : "" ); - snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); - return env->type_str_buf; + return env->tmp_str_buf; } static char slot_type_char[] = { @@ -3308,6 +3308,45 @@ static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) return bt->stack_masks[bt->frame] & (1ull << slot); } +/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ +static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} +/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ +static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + n = snprintf(buf, buf_sz, "%s%d", first ? 
"" : ",", -(i + 1) * 8); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -3331,7 +3370,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->code == 0) return 0; if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "regs=%x stack=%llx before ", bt_reg_mask(bt), bt_stack_mask(bt)); + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); + verbose(env, "mark_precise: frame%d: regs=%s ", + bt->frame, env->tmp_str_buf); + fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } @@ -3531,6 +3574,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, struct bpf_reg_state *reg; int i, j; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", + st->curframe); + } + /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. * We also skip current state and go straight to first parent state, @@ -3542,17 +3590,25 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", + i, j); + } } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", + i, -(j + 1) * 8); + } } } } @@ -3716,8 +3772,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d\n", + bt->frame, last_idx, first_idx); + } if (last_idx < 0) { /* we are at the entry into subprog, which -- cgit v1.2.3 From 1ef22b6865a73a8aed36d43375fe8c7b30869326 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:12 -0700 Subject: bpf: maintain bitmasks across all active frames in __mark_chain_precision Teach __mark_chain_precision logic to maintain register/stack masks across all active frames when going from child state to parent state. Currently this should be mostly no-op, as precision backtracking usually bails out when encountering subprog entry/exit. It's not very apparent from the diff due to increased indentation, but the logic remains the same, except everything is done on specific `fr` frame index. Calls to bt_clear_reg() and bt_clear_slot() are replaced with frame-specific bt_clear_frame_reg() and bt_clear_frame_slot(), where frame index is passed explicitly, instead of using current frame number. We also adjust logging to emit affected frame number. 
And we also add better logging of human-readable register and stack slot masks, similar to previous patch. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 100 ++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5412c8c8511d..5a7997bc96f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3736,7 +3736,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r struct bpf_func_state *func; struct bpf_reg_state *reg; bool skip_first = true; - int i, err; + int i, fr, err; if (!env->bpf_capable) return 0; @@ -3845,56 +3845,62 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (!st) break; - func = st->frame[frame]; - bitmap_from_u64(mask, bt_reg_mask(bt)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_reg(bt, i); - continue; + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) { + bt_clear_frame_reg(bt, fr, i); + continue; + } + if (reg->precise) + bt_clear_frame_reg(bt, fr, i); + else + reg->precise = true; } - if (reg->precise) - bt_clear_reg(bt, i); - else - reg->precise = true; - } - bitmap_from_u64(mask, bt_stack_mask(bt)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* the sequence of instructions: - * 2: (bf) r3 = r10 - * 3: (7b) *(u64 *)(r3 -8) = r0 - * 4: (79) r4 = *(u64 *)(r10 -8) - * doesn't contain jmps. It's backtracked - * as a single block. - * During backtracking insn 3 is not recognized as - * stack access, so at the end of backtracking - * stack slot fp-8 is still marked in stack_mask. - * However the parent state may not have accessed - * fp-8 and it's "unallocated" stack space. - * In such case fallback to conservative. - */ - mark_all_scalars_precise(env, st); - bt_reset(bt); - return 0; - } + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* the sequence of instructions: + * 2: (bf) r3 = r10 + * 3: (7b) *(u64 *)(r3 -8) = r0 + * 4: (79) r4 = *(u64 *)(r10 -8) + * doesn't contain jmps. It's backtracked + * as a single block. + * During backtracking insn 3 is not recognized as + * stack access, so at the end of backtracking + * stack slot fp-8 is still marked in stack_mask. + * However the parent state may not have accessed + * fp-8 and it's "unallocated" stack space. + * In such case fallback to conservative. 
+ */ + mark_all_scalars_precise(env, st); + bt_reset(bt); + return 0; + } - if (!is_spilled_scalar_reg(&func->stack[i])) { - bt_clear_slot(bt, i); - continue; + if (!is_spilled_scalar_reg(&func->stack[i])) { + bt_clear_frame_slot(bt, fr, i); + continue; + } + reg = &func->stack[i].spilled_ptr; + if (reg->precise) + bt_clear_frame_slot(bt, fr, i); + else + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL2) { + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_reg_mask(bt, fr)); + verbose(env, "mark_precise: frame%d: parent state regs=%s ", + fr, env->tmp_str_buf); + fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_stack_mask(bt, fr)); + verbose(env, "stack=%s: ", env->tmp_str_buf); + print_verifier_state(env, func, true); } - reg = &func->stack[i].spilled_ptr; - if (reg->precise) - bt_clear_slot(bt, i); - else - reg->precise = true; - } - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "parent %s regs=%x stack=%llx marks:", - !bt_empty(bt) ? "didn't have" : "already had", - bt_reg_mask(bt), bt_stack_mask(bt)); - print_verifier_state(env, func, true); } if (bt_empty(bt)) -- cgit v1.2.3 From f655badf2a8fc028433d9583bf86a6b473721f09 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:13 -0700 Subject: bpf: fix propagate_precision() logic for inner frames Fix propagate_precision() logic to perform propagation of all necessary registers and stack slots across all active frames *in one batch step*. Doing this for each register/slot in each individual frame is wasteful, but the main problem is that backtracking of instruction in any frame except the deepest one just doesn't work. This is due to backtracking logic relying on jump history, and available jump history always starts (or ends, depending how you view it) in current frame. So, if prog A (frame #0) called subprog B (frame #1) and we need to propagate precision of, say, register R6 (callee-saved) within frame #0, we actually don't even know where jump history that corresponds to prog A even starts. We'd need to skip subprog part of jump history first to be able to do this. Luckily, with struct backtrack_state and __mark_chain_precision() handling bitmasks tracking/propagation across all active frames at the same time (added in previous patch), propagate_precision() can be both fixed and sped up by setting all the necessary bits across all frames and then performing one __mark_chain_precision() pass. This makes it unnecessary to skip subprog parts of jump history. We also improve logging along the way, to clearly specify which registers' and slots' precision markings are propagated within which frame. Each frame will have dedicated line and all registers and stack slots from that frame will be reported in format similar to precision backtrack regs/stack logging. 
E.g.: frame 1: propagating r1,r2,r3,fp-8,fp-16 frame 0: propagating r3,r9,fp-120 Fixes: 529409ea92d5 ("bpf: propagate precision across all frames, not just the last one") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 65 +++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5a7997bc96f5..13bbaa2485fc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3726,8 +3726,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. */ -static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, - int spi) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) { struct backtrack_state *bt = &env->bt; struct bpf_verifier_state *st = env->cur_state; @@ -3742,13 +3741,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r return 0; /* set frame number from which we are starting to backtrack */ - bt_init(bt, frame); + bt_init(bt, env->cur_state->curframe); /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision * tracking in the current state is unnecessary. */ - func = st->frame[frame]; + func = st->frame[bt->frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { @@ -3758,13 +3757,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r bt_set_reg(bt, regno); } - while (spi >= 0) { - if (!is_spilled_scalar_reg(&func->stack[spi])) - break; - bt_set_slot(bt, spi); - break; - } - if (bt_empty(bt)) return 0; @@ -3914,17 +3906,15 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); -} - -static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) -{ - return __mark_chain_precision(env, frame, regno, -1); + return __mark_chain_precision(env, regno); } -static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +/* mark_chain_precision_batch() assumes that env->bt is set in the caller to + * desired reg and stack masks across all relevant frames + */ +static int mark_chain_precision_batch(struct bpf_verifier_env *env) { - return __mark_chain_precision(env, frame, -1, spi); + return __mark_chain_precision(env, -1); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -15361,20 +15351,25 @@ static int propagate_precision(struct bpf_verifier_env *env, struct bpf_reg_state *state_reg; struct bpf_func_state *state; int i, err = 0, fr; + bool first; for (fr = old->curframe; fr >= 0; fr--) { state = old->frame[fr]; state_reg = state->regs; + first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || !state_reg->precise || !(state_reg->live & REG_LIVE_READ)) continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating r%d\n", fr, i); - err = mark_chain_precision_frame(env, fr, i); - if (err < 0) - return err; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) 
+ verbose(env, "frame %d: propagating r%d", fr, i); + else + verbose(env, ",r%d", i); + } + bt_set_frame_reg(&env->bt, fr, i); + first = false; } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { @@ -15385,14 +15380,24 @@ static int propagate_precision(struct bpf_verifier_env *env, !state_reg->precise || !(state_reg->live & REG_LIVE_READ)) continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating fp%d\n", - fr, (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack_frame(env, fr, i); - if (err < 0) - return err; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) + verbose(env, "frame %d: propagating fp%d", + fr, (-i - 1) * BPF_REG_SIZE); + else + verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); + } + bt_set_frame_slot(&env->bt, fr, i); + first = false; } + if (!first) + verbose(env, "\n"); } + + err = mark_chain_precision_batch(env); + if (err < 0) + return err; + return 0; } -- cgit v1.2.3 From c50c0b57a515826b5d2e1ce85cd85f24f0da10c2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 May 2023 21:33:14 -0700 Subject: bpf: fix mark_all_scalars_precise use in mark_chain_precision When precision backtracking bails out due to some unsupported sequence of instructions (e.g., stack access through register other than r10), we need to mark all SCALAR registers as precise to be safe. Currently, though, we mark SCALARs precise only starting from the state we detected unsupported condition, which could be one of the parent states of the actual current state. This will leave some registers potentially not marked as precise, even though they should. So make sure we start marking scalars as precise from current state (env->cur_state). Further, we don't currently detect a situation when we end up with some stack slots marked as needing precision, but we ran out of available states to find the instructions that populate those stack slots. This is akin the `i >= func->allocated_stack / BPF_REG_SIZE` check and should be handled similarly by falling back to marking all SCALARs precise. Add this check when we run out of states. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 13bbaa2485fc..899122832d8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3806,7 +3806,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = backtrack_insn(env, i, bt); } if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, st); + mark_all_scalars_precise(env, env->cur_state); bt_reset(bt); return 0; } else if (err) { @@ -3868,7 +3868,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * fp-8 and it's "unallocated" stack space. * In such case fallback to conservative. 
 				 */
-				mark_all_scalars_precise(env, st);
+				mark_all_scalars_precise(env, env->cur_state);
 				bt_reset(bt);
 				return 0;
 			}
@@ -3896,11 +3896,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 		}

 		if (bt_empty(bt))
-			break;
+			return 0;

 		last_idx = st->last_insn_idx;
 		first_idx = st->first_insn_idx;
 	}
+
+	/* if we still have requested precise regs or slots, we missed
+	 * something (e.g., stack access through non-r10 register), so
+	 * fallback to marking all precise
+	 */
+	if (!bt_empty(bt)) {
+		mark_all_scalars_precise(env, env->cur_state);
+		bt_reset(bt);
+	}
+
 	return 0;
 }
-- cgit v1.2.3

From fde2a3882bd07876c144f2e00f7ae6893c378180 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 4 May 2023 21:33:15 -0700
Subject: bpf: support precision propagation in the presence of subprogs

Add support for precision backtracking in the presence of subprogram
frames in jump history. This means supporting a few different kinds of
subprogram invocation situations, all requiring slightly different
handling in precision backtracking logic:

- static subprogram calls;
- global subprogram calls;
- callback-calling helpers/kfuncs.

For each of those we need to handle a few precision propagation cases:

- what to do with precision of subprog returns (r0);
- what to do with precision of input arguments;
- for all of them, callee-saved registers in the caller function should
  be propagated ignoring the subprog/callback part of jump history.

N.B. Async callback-calling helpers (currently only
bpf_timer_set_callback()) are transparent to all this because they set
up a separate async callback environment and thus the callback's history
is not shared with the main program's history. So as far as all the
changes in this commit go, such a helper is just a regular helper.

Let's look at all these situations in more detail. Let's start with a
static subprogram being called, using an excerpt of a simple main
program and its static subprog, indenting the subprog's frame slightly
to make everything clear.

frame 0                         frame 1                 precision set
=======                         =======                 =============

 9: r6 = 456;
10: r1 = 123;                                           fr0: r6
11: call pc+10;                                         fr0: r1, r6
                                22: r0 = r1;            fr0: r6; fr1: r1
                                23: exit                fr0: r6; fr1: r0
12: r1 = <map_ptr>                                      fr0: r0, r6
13: r1 += r0;                                           fr0: r0, r6
14: r1 += r6;                                           fr0: r6
15: exit

As can be seen above, the main function is passing 123 as a single
argument to an identity (`return x;`) subprog. The returned value is
used to adjust a map pointer offset, which forces r0 to be marked as
precise. Then instruction #14 does the same for callee-saved r6, which
will have to be backtracked all the way to instruction #9. For brevity,
precision sets for instructions #13 and #14 are combined in the diagram
above.

First, for subprog calls, r0 returned from the subprog (in frame 0) has
to go into the subprog's frame 1, and should be cleared from frame 0. So
we go back into the subprog's frame knowing we need to mark r0 precise.
We then see that insn #22 sets r0 from r1, so now we care about marking
r1 precise. When we pop up from the subprog's frame back into the caller
at insn #11 we keep r1, as it's an argument-passing register, so we
eventually find `10: r1 = 123;` and satisfy the precision propagation
chain for insn #13.

This example demonstrates two sets of rules:
- r0 returned after a subprog call has to be moved into the subprog's
  r0 set;
- *static* subprog arguments (r1-r5) are moved back to the caller
  precision set.

Let's look at what happens with callee-saved precision propagation. Insn
#14 marks r6 as precise. When we get into subprog's frame, we keep r6 in
frame 0's precision set *only*. 
Subprog itself has its own set of independent r6-r10 registers and is
not affected. When we eventually make our way out of the subprog frame
we keep r6 in the precision set until we reach `9: r6 = 456;`,
satisfying propagation. r6-r10 propagation is perhaps the simplest
aspect, as it always stays in its original frame.

That's pretty much all we have to do to support precision propagation
across *static subprog* invocation.

Let's look at what happens when we have a global subprog invocation.

frame 0                         frame 1                 precision set
=======                         =======                 =============

 9: r6 = 456;
10: r1 = 123;                                           fr0: r6
11: call pc+10; # global subprog                        fr0: r6
12: r1 = <map_ptr>                                      fr0: r0, r6
13: r1 += r0;                                           fr0: r0, r6
14: r1 += r6;                                           fr0: r6;
15: exit

Starting from insn #13, r0 has to be precise. We backtrack all the way
to insn #11 (call pc+10) and see that the subprog is global, so it was
already validated in isolation. As opposed to a static subprog, a global
subprog always returns an unknown scalar r0, so that satisfies precision
propagation and we drop r0 from the precision set. We are done for insn
#13.

Now for insn #14. r6 is in the precision set, and we backtrack to `call
pc+10;`. Here we need to recognize that this is effectively both an exit
from and an entry to the global subprog, which means we stay in the
caller's frame. So we carry on with r6 still in the precision set, until
we satisfy it at insn #9. The only hard part with global subprogs is
just knowing when it's a global func.

Lastly, callback-calling helpers and kfuncs do simulate subprog calls,
so jump history will have subprog instructions in between the caller
program's instructions, but the rules of propagating r0 and r1-r5
differ, because we don't actually directly call the callback. We
actually call a helper/kfunc, which at runtime will call the subprog, so
the only difference from normal helper/kfunc handling is that we need to
make sure to skip the callback-simulating part of jump history.

Let's look at an example to make this clearer.

frame 0                         frame 1                 precision set
=======                         =======                 =============

 8: r6 = 456;
 9: r1 = 123;                                           fr0: r6
10: r2 = &callback;                                     fr0: r6
11: call bpf_loop;                                      fr0: r6
                                22: r0 = r1;            fr0: r6 fr1:
                                23: exit                fr0: r6 fr1:
12: r1 = <map_ptr>                                      fr0: r0, r6
13: r1 += r0;                                           fr0: r0, r6
14: r1 += r6;                                           fr0: r6;
15: exit

Again, insn #13 forces r0 to be precise. As soon as we get to `23: exit`
we see that this isn't actually a static subprog call (it's `call
bpf_loop;` helper call instead). So we clear r0 from the precision set.

For a callee-saved register, there is no difference: it stays in frame
0's precision set, we go through insns #22 and #23, ignoring them until
we get back to caller frame 0, eventually satisfying precision backtrack
logic at insn #8 (`r6 = 456;`).

Assuming the callback needed to set r0 as precise at insn #23, we'd
backtrack to insn #22, switching from r0 to r1, and then at the point
when we pop back to frame 0 at insn #11, we'll clear r1-r5 from the
precision set, as we don't really do a subprog call directly, so there
is no input argument precision propagation.

That's pretty much it. With these changes, it seems like the only still
unsupported situation for precision backpropagation is the case when a
program is accessing the stack through registers other than r10. This is
still left as an unsupported (though rare) case for now.

As for results. 
As for the results. For selftests, there are a few positive changes for the bigger programs, with cls_redirect in its dynptr variant benefiting the most:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results.csv ~/subprog-precise-after-results.csv -f @veristat.cfg -e file,prog,insns -f 'insns_diff!=0'
File                                      Program        Insns (A)  Insns (B)  Insns     (DIFF)
----------------------------------------  -------------  ---------  ---------  ----------------
pyperf600_bpf_loop.bpf.linked1.o          on_event            2060       2002      -58 (-2.82%)
test_cls_redirect_dynptr.bpf.linked1.o    cls_redirect       15660       2914  -12746 (-81.39%)
test_cls_redirect_subprogs.bpf.linked1.o  cls_redirect       61620      59088    -2532 (-4.11%)
xdp_synproxy_kern.bpf.linked1.o           syncookie_tc      109980      86278  -23702 (-21.55%)
xdp_synproxy_kern.bpf.linked1.o           syncookie_xdp      97716      85147  -12569 (-12.86%)

Cilium programs don't really regress. They don't use subprogs and are mostly unaffected, but some other fixes and improvements could have changed something. This doesn't appear to be the case:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-cilium.csv ~/subprog-precise-after-results-cilium.csv -e file,prog,insns -f 'insns_diff!=0'
File           Program                         Insns (A)  Insns (B)  Insns (DIFF)
-------------  ------------------------------  ---------  ---------  ------------
bpf_host.o     tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_lxc.o      tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_overlay.o  tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_xdp.o      tail_handle_nat_fwd_ipv6            12475      12504  +29 (+0.23%)
bpf_xdp.o      tail_nodeport_nat_ingress_ipv6       6363       6371   +8 (+0.13%)

Looking at (somewhat anonymized) Meta production programs, we see mostly insignificant variation in the number of instructions, with one program (syar_bind6_protect6) benefiting the most at -17%.

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-fbcode.csv ~/subprog-precise-after-results-fbcode.csv -e prog,insns -f 'insns_diff!=0'
Program                   Insns (A)  Insns (B)  Insns      (DIFF)
------------------------  ---------  ---------  -----------------
on_request_context_event        597        585       -12 (-2.01%)
read_async_py_stack           43789      43657      -132 (-0.30%)
read_sync_py_stack            35041      37599     +2558 (+7.30%)
rrm_usdt                        946        940        -6 (-0.63%)
sysarmor_inet6_bind           28863      28249      -614 (-2.13%)
sysarmor_inet_bind            28845      28240      -605 (-2.10%)
syar_bind4_protect4          154145     147640     -6505 (-4.22%)
syar_bind6_protect6          165242     137088   -28154 (-17.04%)
syar_task_exit_setgid         21289      19720     -1569 (-7.37%)
syar_task_exit_setuid         21290      19721     -1569 (-7.37%)
do_uprobe                     19967      19413      -554 (-2.77%)
tw_twfw_ingress              215877     204833    -11044 (-5.12%)
tw_twfw_tc_in                215877     204833    -11044 (-5.12%)

But checking duration (wall clock) differences, that is, the actual time taken by the verifier to validate programs, we see sometimes dramatic improvements, all the way to about 16x:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-meta.csv ~/subprog-precise-after-results-meta.csv -e prog,duration -s duration_diff^ | head -n20
Program                      Duration (us) (A)  Duration (us) (B)  Duration (us) (DIFF)
---------------------------  -----------------  -----------------  --------------------
tw_twfw_ingress                        4488374             272836    -4215538 (-93.92%)
tw_twfw_tc_in                          4339111             268175    -4070936 (-93.82%)
tw_twfw_egress                         3521816             270751    -3251065 (-92.31%)
tw_twfw_tc_eg                          3472878             284294    -3188584 (-91.81%)
balancer_ingress                        343119             291391      -51728 (-15.08%)
syar_bind6_protect6                      78992              64782      -14210 (-17.99%)
ttls_tc_ingress                          11739               8176       -3563 (-30.35%)
kprobe__security_inode_link              13864              11341       -2523 (-18.20%)
read_sync_py_stack                       21927              19442       -2485 (-11.33%)
read_async_py_stack                      30444              28136       -2308 (-7.58%)
syar_task_exit_setuid                    10256               8440       -1816 (-17.71%)

Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230505043317.3629845-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 163 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 143 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 899122832d8e..0fa96581eb77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -240,6 +240,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_helper_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0; +} + static bool bpf_pseudo_call(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && @@ -469,6 +475,13 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } +static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) +{ + struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; + + return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); } @@ -516,6 +529,8 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_callback_calling_kfunc(u32 btf_id); + static bool is_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -525,6 +540,11 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) func_id == BPF_FUNC_user_ringbuf_drain; } +static bool is_async_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_timer_set_callback; +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -3350,8 +3370,13 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. + * + * @idx is an index of the instruction we are currently processing; + * @subseq_idx is an index of the subsequent instruction that: + * - *would be* executed next, if jump history is viewed in forward order; + * - *was* processed previously during backtracking.
*/ -static int backtrack_insn(struct bpf_verifier_env *env, int idx, +static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { @@ -3365,7 +3390,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u8 mode = BPF_MODE(insn->code); u32 dreg = insn->dst_reg; u32 sreg = insn->src_reg; - u32 spi; + u32 spi, i; if (insn->code == 0) return 0; @@ -3457,14 +3482,86 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (class == BPF_STX) bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { - if (opcode == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - return -ENOTSUPP; - /* BPF helpers that invoke callback subprogs are - * equivalent to BPF_PSEUDO_CALL above + if (bpf_pseudo_call(insn)) { + int subprog_insn_idx, subprog; + + subprog_insn_idx = idx + insn->imm + 1; + subprog = find_subprog(env, subprog_insn_idx); + if (subprog < 0) + return -EFAULT; + + if (subprog_is_global(env, subprog)) { + /* check that jump history doesn't have any + * extra instructions from subprog; the next + * instruction after call to global subprog + * should be literally next instruction in + * caller program + */ + WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + /* r1-r5 are invalidated after subprog call, + * so for global func call it shouldn't be set + * anymore + */ + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + /* global subprog always sets R0 */ + bt_clear_reg(bt, BPF_REG_0); + return 0; + } else { + /* static subprog call instruction, which + * means that we are exiting current subprog, + * so only r1-r5 could be still requested as + * precise, r0 and r6-r10 or any stack slot in + * the current frame should be zero by now + */ + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + /* we don't track register spills perfectly, + * so fallback to force-precise instead of failing */ + if (bt_stack_mask(bt) != 0) + return -ENOTSUPP; + /* propagate r1-r5 to the caller */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (bt_is_reg_set(bt, i)) { + bt_clear_reg(bt, i); + bt_set_frame_reg(bt, bt->frame - 1, i); + } + } + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } + } else if ((bpf_helper_call(insn) && + is_callback_calling_function(insn->imm) && + !is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { + /* callback-calling helper or kfunc call, which means + * we are exiting from subprog, but unlike the subprog + * call handling above, we shouldn't propagate + * precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback + * subprogs */ - if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (bt_stack_mask(bt) != 0) return -ENOTSUPP; + /* clear r1-r5 in callback subprog's mask */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } else if (opcode == BPF_CALL) { /* kfunc with imm==0 is invalid and fixup_kfunc_call will * catch this error later. 
Make backtracking conservative * with ENOTSUPP. @@ -3482,7 +3579,39 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, return -EFAULT; } } else if (opcode == BPF_EXIT) { - return -ENOTSUPP; + bool r0_precise; + + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + /* BPF_EXIT in subprog or callback always returns + * right after the call instruction, so by checking + * whether the instruction at subseq_idx-1 is subprog + * call or not we can distinguish actual exit from + * *subprog* from exit from *callback*. In the former + * case, we need to propagate r0 precision, if + * necessary. In the latter we never do that. + */ + r0_precise = subseq_idx - 1 >= 0 && + bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && + bt_is_reg_set(bt, BPF_REG_0); + + bt_clear_reg(bt, BPF_REG_0); + if (bt_subprog_enter(bt)) + return -EFAULT; + + if (r0_precise) + bt_set_reg(bt, BPF_REG_0); + /* r6-r9 and stack slots will stay set in caller frame + * bitmasks until we return back from callee(s) + */ + return 0; } else if (BPF_SRC(insn->code) == BPF_X) { if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) return 0; @@ -3735,7 +3864,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) struct bpf_func_state *func; struct bpf_reg_state *reg; bool skip_first = true; - int i, fr, err; + int i, prev_i, fr, err; if (!env->bpf_capable) return 0; @@ -3798,12 +3927,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - for (i = last_idx;;) { + for (i = last_idx, prev_i = -1;;) { if (skip_first) { err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, bt); + err = backtrack_insn(env, i, prev_i, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, env->cur_state); @@ -3820,6 +3949,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; if (i == first_idx) break; + prev_i = i; i = get_prev_insn_idx(st, i, &history); if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 @@ -8400,17 +8530,13 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static bool is_callback_calling_kfunc(u32 btf_id); - static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; - struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int err; - bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -8425,13 +8551,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - func_info_aux = env->prog->aux->func_info_aux; - if (func_info_aux) - is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (is_global) { + if (subprog_is_global(env, subprog)) { if (err) { verbose(env, "Caller passes invalid args into func#%d\n", subprog); -- cgit v1.2.3 From b5ad4cdc46c7d6e7f8d2c9e24b6c9a1edec95154 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Sat, 6 May 2023 11:15:44 +0800 Subject: bpf: Add
bpf_task_under_cgroup() kfunc Add a kfunc that's similar to bpf_current_task_under_cgroup(). The difference is that it operates on a designated task. When hooking sched-related functions, it is sometimes necessary to specify a task other than the current task. Signed-off-by: Feng Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230506031545.35991-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb6b4637ebf2..a128fe0ab2d0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2149,6 +2149,22 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) return NULL; return cgrp; } + +/** + * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test + * task's membership of cgroup ancestry. + * @task: the task to be tested + * @ancestor: possible ancestor of @task's cgroup + * + * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. + * It follows all the same rules as cgroup_is_descendant, and only applies + * to the default hierarchy. + */ +__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, + struct cgroup *ancestor) +{ + return task_under_cgroup_hierarchy(task, ancestor); +} #endif /* CONFIG_CGROUPS */ /** @@ -2400,6 +2416,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_SET8_END(generic_btf_ids) -- cgit v1.2.3 From 3bda08b63670c39be390fcb00e7718775508e673 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 5 May 2023 18:31:30 -0700 Subject: bpf: Allow NULL buffers in bpf_dynptr_slice(_rw) bpf_dynptr_slice(_rw) uses a user-provided buffer if it cannot provide a pointer to a block of contiguous memory. This buffer is unused in the case of local dynptrs, and may be unused in other cases as well. There is no need to require the buffer, as the kfunc can just return NULL if it was needed and not provided. This adds another kfunc annotation, __opt, which combines with __sz and __szk to allow the buffer associated with the size to be NULL. If the buffer is NULL, the verifier does not check that the buffer is of sufficient size. Signed-off-by: Daniel Rosenberg Link: https://lore.kernel.org/r/20230506013134.2492210-2-drosen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++++++++++------------ kernel/bpf/verifier.c | 17 +++++++++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a128fe0ab2d0..4ef4c4f8a355 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2190,13 +2190,15 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. + * @buffer__opt: User-provided buffer to copy contents into.
May be NULL + * @buffer__szk: Size (in bytes) of the buffer if present. This is the + * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * + * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * @@ -2213,7 +2215,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, - void *buffer, u32 buffer__szk) + void *buffer__opt, u32 buffer__szk) { enum bpf_dynptr_type type; u32 len = buffer__szk; @@ -2233,15 +2235,17 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer); + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (xdp_ptr) return xdp_ptr; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false); - return buffer; + if (!buffer__opt) + return NULL; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); + return buffer__opt; } default: WARN_ONCE(true, "unknown dynptr type %d\n", type); @@ -2253,13 +2257,15 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. + * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__szk: Size (in bytes) of the buffer if present. This is the + * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * + * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb @@ -2290,7 +2296,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, - void *buffer, u32 buffer__szk) + void *buffer__opt, u32 buffer__szk) { if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; @@ -2317,7 +2323,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. 
*/ - return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); + return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0fa96581eb77..7e6bbae9db81 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9743,6 +9743,11 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return __kfunc_param_match_suffix(btf, arg, "__szk"); } +static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__opt"); +} + static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return __kfunc_param_match_suffix(btf, arg, "__k"); } @@ -10830,13 +10835,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_MEM_SIZE: { + struct bpf_reg_state *buff_reg = &regs[regno]; + const struct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = &regs[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); - if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); - return ret; + if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + if (ret < 0) { + verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + return ret; + } } if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { -- cgit v1.2.3 From 2012c867c8005d72c949e274133df429ece78808 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 5 May 2023 18:31:33 -0700 Subject: bpf: verifier: Accept dynptr mem as mem in helpers This allows using memory retrieved from dynptrs with helper functions that accept ARG_PTR_TO_MEM. For instance, results from bpf_dynptr_data can be passed along to bpf_strncmp. Signed-off-by: Daniel Rosenberg Link: https://lore.kernel.org/r/20230506013134.2492210-5-drosen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e6bbae9db81..754129d41225 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7495,12 +7495,16 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL * + * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type. + * * Therefore we fold these flags depending on the arg_type before comparison. */ if (arg_type & MEM_RDONLY) type &= ~MEM_RDONLY; if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + if (base_type(arg_type) == ARG_PTR_TO_MEM) + type &= ~DYNPTR_TYPE_FLAG_MASK; if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) type &= ~MEM_ALLOC; -- cgit v1.2.3 From 0d6d062ca27ec7ef547712d34dcfcfb952bcef53 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 4 May 2023 16:30:00 +0530 Subject: perf/core: Rework forwarding of {task|cpu}-clock events Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and cpu-clock events are interfaced through it but internally get forwarded to their own pmus. Rework this by overwriting event->attr.type in perf_swevent_init(), which will cause perf_init_event() to retry with the updated type, and the event will automatically get forwarded to the right pmu. With the change, the SW pmu no longer needs to be treated specially and can be included in the 'pmu_idr' list. Suggested-by: Peter Zijlstra Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230504110003.2548-2-ravi.bangoria@amd.com --- kernel/events/core.c | 77 ++++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 68baa8194d9f..c01bbe93e291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event) return; send_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + event->orig_type, event->attr.sig_data); } /* @@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event) swevent_hlist_put(); } +static struct pmu perf_cpu_clock; /* fwd declaration */ +static struct pmu perf_task_clock; + static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; @@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event) switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: + event->attr.type = perf_cpu_clock.type; + return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: + event->attr.type = perf_task_clock.type; return -ENOENT; default: @@ -11086,7 +11092,7 @@ static void cpu_clock_event_read(struct perf_event *event) static int cpu_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) @@ -11107,6 +11113,7 @@ static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, @@ -11167,7 +11174,7 @@ static void task_clock_event_read(struct perf_event *event) static int task_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) @@ -11188,6 +11195,7 @@ static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, @@ -11415,31 +11423,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto unlock; pmu->type = -1; - if (!name) - goto skip_type; + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; - if (type != PERF_TYPE_SOFTWARE) { - if (type >= 0) - max = type; + if (type >= 0) + max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + if (ret < 0) + goto free_pdc; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && ret != type); - type = ret; - } + type = ret; pmu->type = type; - if (pmu_bus_running) { + if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } -skip_type: ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) @@ -11481,16 +11489,7 @@ skip_type:
if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - /* - * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, - * since these cannot be in the IDR. This way the linear search - * is fast, provided a valid software event is provided. - */ - if (type == PERF_TYPE_SOFTWARE || !name) - list_add_rcu(&pmu->entry, &pmus); - else - list_add_tail_rcu(&pmu->entry, &pmus); - + list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: @@ -11499,12 +11498,13 @@ unlock: return ret; free_dev: - device_del(pmu->dev); - put_device(pmu->dev); + if (pmu->dev && pmu->dev != PMU_NULL_DEV) { + device_del(pmu->dev); + put_device(pmu->dev); + } free_idr: - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); + idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); @@ -11525,9 +11525,8 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running) { + idr_remove(&pmu_idr, pmu->type); + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); @@ -11601,6 +11600,12 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* + * Save original type before calling pmu->event_init() since certain + * pmus overwrites event->attr.type to forward event to another pmu. + */ + event->orig_type = event->attr.type; + /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; @@ -13640,8 +13645,8 @@ void __init perf_event_init(void) perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); + perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); + perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); @@ -13684,7 +13689,7 @@ static int __init perf_event_sysfs_init(void) goto unlock; list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) + if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); -- cgit v1.2.3 From 9551fbb64d094cc105964716224adeb7765df8fd Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 4 May 2023 16:30:02 +0530 Subject: perf/core: Remove pmu linear searching code Searching for the right pmu by iterating over all pmus is no longer required since all pmus now *must* be present in the 'pmu_idr' list. So, remove linear searching code. 
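After this change the lookup in perf_init_event() reduces to an idr query. A minimal hedged sketch of the remaining pattern (simplified; the real code also handles extended types, a retry with the forwarded type, and runs under pmus_srcu):

	/* Sketch only: look up a pmu registered in pmu_idr by type. */
	static struct pmu *lookup_pmu_by_type(int type)
	{
		struct pmu *pmu;

		rcu_read_lock();
		pmu = idr_find(&pmu_idr, type);
		rcu_read_unlock();

		return pmu; /* NULL if nothing is registered under this type */
	}
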
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230504110003.2548-4-ravi.bangoria@amd.com --- kernel/events/core.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c01bbe93e291..231b187c1a3f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11630,38 +11630,27 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: + ret = -ENOENT; rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); - if (pmu) { - if (event->attr.type != type && type != PERF_TYPE_RAW && - !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; - - ret = perf_try_init_event(pmu, event); - if (ret == -ENOENT && event->attr.type != type && !extended_type) { - type = event->attr.type; - goto again; - } + if (!pmu) + goto fail; - if (ret) - pmu = ERR_PTR(ret); + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + goto fail; - goto unlock; + ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type && !extended_type) { + type = event->attr.type; + goto again; } - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { - ret = perf_try_init_event(pmu, event); - if (!ret) - goto unlock; - - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } - } fail: - pmu = ERR_PTR(-ENOENT); + if (ret) + pmu = ERR_PTR(ret); + unlock: srcu_read_unlock(&pmus_srcu, idx); -- cgit v1.2.3 From 8b36d07f1d63de102d464f44a89704bc62d00811 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:37 -0700 Subject: sched/fair: Move is_core_idle() out of CONFIG_NUMA asym_packing needs this function to determine whether an SMT core is a suitable destination for load balancing. Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-2-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 373ff5f55884..a47208dbb42a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +static inline bool is_core_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + int sibling; + + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; + + if (!idle_cpu(sibling)) + return false; + } +#endif + + return true; +} + #ifdef CONFIG_NUMA #define NUMA_IMBALANCE_MIN 2 @@ -1700,23 +1717,6 @@ struct numa_stats { int idle_cpu; }; -static inline bool is_core_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - int sibling; - - for_each_cpu(sibling, cpu_smt_mask(cpu)) { - if (cpu == sibling) - continue; - - if (!idle_cpu(sibling)) - return false; - } -#endif - - return true; -} - struct task_numa_env { struct task_struct *p; -- cgit v1.2.3 From eefefa716c9fa6aa73159f09954b7eeba4cafd09 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:38 -0700 Subject: sched/fair: Only do asym_packing load balancing from fully idle SMT cores When balancing load between cores, all the SMT siblings of the destination CPU, if any, must be idle. Otherwise, pulling new tasks degrades the throughput of the busy SMT siblings. 
The overall throughput of the system remains the same. When balancing load within an SMT core this consideration is not relevant. Follow the priorities that hardware indicates. Suggested-by: Valentin Schneider Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-3-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a47208dbb42a..713d03e73978 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9330,6 +9330,25 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +/** + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing + * @cpu: A CPU + * + * Always use CPU priority when balancing load between SMT siblings. When + * balancing load between cores, it is not sufficient that @cpu is idle. Only + * use CPU priority if the whole core is idle. + * + * Returns: True if the priority of @cpu must be followed. False otherwise. + */ +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) +{ + if (!sched_smt_active()) + return true; + + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); +} + /** * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks * @dst_cpu: Destination CPU of the load balancing @@ -9340,6 +9359,9 @@ group_type group_classify(unsigned int imbalance_pct, * Check the state of the SMT siblings of both @sds::local and @sg and decide * if @dst_cpu can pull tasks. * + * This function must be called only if all the SMT siblings of @dst_cpu are + * idle, if any. + * * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks * only if @dst_cpu has higher priority. @@ -9349,8 +9371,7 @@ group_type group_classify(unsigned int imbalance_pct, * Bigger imbalances in the number of busy CPUs will be dealt with in * update_sd_pick_busiest(). * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. + * If @sg does not have SMT siblings, only pull tasks if @sg has lower priority. * * Return: true if @dst_cpu can pull tasks, false otherwise. */ @@ -9398,15 +9419,8 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, return false; } - /* - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). - */ - if (!sds->local_stat.sum_nr_running) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - - return false; + /* If we are here @dst_cpu has SMT siblings and are also idle. */ + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); #else /* Always return false so that callers deal with non-SMT cases. */ return false; @@ -9417,7 +9431,11 @@ static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) { - /* Only do SMT checks if either local or candidate have SMT siblings */ + /* Ensure that the whole local core is idle, if applicable. 
*/ + if (!sched_use_asym_prio(env->sd, env->dst_cpu)) + return false; + + /* Only do SMT checks if either local or candidate have SMT siblings. */ if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || (group->flags & SD_SHARE_CPUCAPACITY)) return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); @@ -10632,11 +10650,13 @@ static inline bool asym_active_balance(struct lb_env *env) { /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + * ASYM_PACKING needs to force migrate tasks from busy but lower + * priority CPUs in order to pack all tasks in the highest priority + * CPUs. When done between cores, do it only if the whole core is + * idle. */ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_use_asym_prio(env->sd, env->dst_cpu) && sched_asym_prefer(env->dst_cpu, env->src_cpu); } @@ -11371,9 +11391,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. + * + * When balancing between cores, all the SMT siblings of the + * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { + if (sched_use_asym_prio(sd, i) && + sched_asym_prefer(i, cpu)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } -- cgit v1.2.3 From ef7657d4d2d6a8456aa624010de456c32a135fe9 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:39 -0700 Subject: sched/fair: Simplify asym_packing logic for SMT cores Callers of asym_smt_can_pull_tasks() check the idle state of the destination CPU and its SMT siblings, if any. No extra checks are needed in that function. Since SMT cores divide capacity among their siblings, priorities only really make sense if only one sibling is active. This is true for SMT2, SMT4, SMT8, etc. Do not use asym_packing load balance for this case. Instead, let find_busiest_group() handle imbalances. When balancing non-SMT cores or at higher scheduling domains (e.g., between MC scheduling groups), continue using priorities. Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Len Brown Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-4-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 713d03e73978..a8a02ae7d082 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9366,12 +9366,9 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks * only if @dst_cpu has higher priority. * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. - * Bigger imbalances in the number of busy CPUs will be dealt with in - * update_sd_pick_busiest(). - * - * If @sg does not have SMT siblings, only pull tasks if @sg has lower priority. + * When dealing with SMT cores, only use priorities if the SMT core has exactly + * one busy sibling. find_busiest_group() will handle bigger imbalances in the + * number of busy CPUs. * * Return: true if @dst_cpu can pull tasks, false otherwise.
*/ @@ -9380,12 +9377,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sched_group *sg) { #ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; + bool local_is_smt; int sg_busy_cpus; local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; if (!local_is_smt) { @@ -9406,21 +9401,17 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); } - /* @dst_cpu has SMT siblings. */ - - if (sg_is_smt) { - int local_busy_cpus = sds->local->group_weight - - sds->local_stat.idle_cpus; - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; - - if (busy_cpus_delta == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - + /* + * If we are here @dst_cpu has SMT siblings and are also idle. + * + * CPU priorities does not make sense for SMT cores with more than one + * busy sibling. + */ + if (group->flags & SD_SHARE_CPUCAPACITY && sg_busy_cpus != 1) return false; - } - /* If we are here @dst_cpu has SMT siblings and are also idle. */ return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + #else /* Always return false so that callers deal with non-SMT cases. */ return false; -- cgit v1.2.3 From 18ad34532755feb5b9f4284b07769b1bfec18ab3 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:40 -0700 Subject: sched/fair: Let low-priority cores help high-priority busy SMT cores Using asym_packing priorities within an SMT core is straightforward. Just follow the priorities that hardware indicates. When balancing load from an SMT core, also consider the idle state of its siblings. Priorities do not reflect that an SMT core divides its throughput among all its busy siblings. They only make sense when exactly one sibling is busy. Indicate that active balance is needed if the destination CPU has lower priority than the source CPU but the latter has busy SMT siblings. Make find_busiest_queue() not skip higher-priority SMT cores with more than one busy sibling. Suggested-by: Valentin Schneider Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-5-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a8a02ae7d082..85ce2494234d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10551,8 +10551,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; - /* Make sure we only pull tasks from a CPU of lower priority */ + /* + * Make sure we only pull tasks from a CPU of lower priority + * when balancing between SMT siblings. + * + * If balancing between cores, let lower priority CPUs help + * SMT cores with more than one busy sibling. + */ if ((env->sd->flags & SD_ASYM_PACKING) && + sched_use_asym_prio(env->sd, i) && sched_asym_prefer(i, env->dst_cpu) && nr_running == 1) continue; @@ -10645,10 +10652,15 @@ asym_active_balance(struct lb_env *env) * priority CPUs in order to pack all tasks in the highest priority * CPUs. When done between cores, do it only if the whole core is * idle. + * + * If @env::src_cpu is an SMT core with busy siblings, let + * the lower priority @env::dst_cpu help it. Do not follow + * CPU priority.
*/ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && sched_use_asym_prio(env->sd, env->dst_cpu) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool -- cgit v1.2.3 From 5fd6d7f43958cb62da105c8413eac3e78480f09a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:41 -0700 Subject: sched/fair: Keep a fully_busy SMT sched group as busiest When comparing two fully_busy scheduling groups, keep the current busiest group if it represents an SMT core. Tasks in such a scheduling group share CPU resources and need more help than tasks in a non-SMT fully_busy group. Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-6-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 85ce2494234d..4a9f04095742 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9619,10 +9619,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we - * select the 1st one. + * select the 1st one, except if @sg is composed of SMT + * siblings. */ - if (sgs->avg_load <= busiest->avg_load) + + if (sgs->avg_load < busiest->avg_load) return false; + + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. + * If @sg happens to also be SMT, either choice is good. + */ + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) + return false; + } + break; case group_has_spare: -- cgit v1.2.3 From 43726bdedd29797d8e1fee2e7300a6d2b9a74ba8 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:42 -0700 Subject: sched/fair: Use the busiest group to set prefer_sibling The prefer_sibling setting acts on the busiest group to move excess tasks to the local group. This should be done as per request of the child of the busiest group's sched domain, not the local group's. Using the flags of the child domain of the local group works fortuitously if both groups have child domains. There are cases, however, in which the busiest group's sched domain has a child but the local group's does not. Consider, for instance, a non-SMT core (or an SMT core with only one online sibling) doing load balance with an SMT core at the MC level. SD_PREFER_SIBLING of the busiest group's child domain will not be honored. We are left with a fully busy SMT core and an idle non-SMT core.
Suggested-by: Dietmar Eggemann Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-7-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a9f04095742..3bb89346dbb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10109,7 +10109,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; @@ -10150,8 +10149,13 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + /* + * Indicate that the child domain of the busiest group prefers tasks + * go to a child's sibling domains first. NB the flags of a sched group + * are those of the child domain. + */ + if (sds->busiest) + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); if (env->sd->flags & SD_NUMA) @@ -10461,7 +10465,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. + */ if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; -- cgit v1.2.3 From c9ca07886aaa40225a29e5c1e46ac31d2e14f53a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:43 -0700 Subject: sched/fair: Do not even the number of busy CPUs via asym_packing Now that find_busiest_group() triggers load balancing between a fully_busy SMT2 core and an idle non-SMT core, it is no longer needed to force balancing via asym_packing. Use asym_packing only as intended: when there is a high-priority CPU that is idle. After this change, the same logic applies to SMT and non-SMT local groups. It makes less sense having a separate function to deal specifically with SMT. Fold the logic in asym_smt_can_pull_tasks() into sched_asym().
Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230406203148.19182-8-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 86 +++++++++++++---------------------------------------- 1 file changed, 21 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bb89346dbb3..48b6f0ca13ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9350,74 +9350,26 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) } /** - * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks - * @dst_cpu: Destination CPU of the load balancing + * sched_asym - Check if the destination CPU can do asym_packing load balance + * @env: The load balancing environment * @sds: Load-balancing data with statistics of the local group * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group + * @group: The candidate busiest group * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. + * @env::dst_cpu can do asym_packing if it has higher priority than the + * preferred CPU of @group. * - * This function must be called only if all the SMT siblings of @dst_cpu are - * idle, if any. + * SMT is a special case. If we are balancing load between cores, @env::dst_cpu + * can do asym_packing balance only if all its SMT siblings are idle. Also, it + * can only do it if @group is an SMT group and has exactly one busy CPU. Larger + * imbalances in the number of CPUs are dealt with in find_busiest_group(). * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. + * If we are balancing load within an SMT core, or at DIE domain level, always + * proceed. * - * When dealing with SMT cores, only use priorities if the SMT core has exactly - * one busy sibling. find_busiest_group() will handle bigger imbalances in the - * number of busy CPUs. - * - * Return: true if @dst_cpu can pull tasks, false otherwise. + * Return: true if @env::dst_cpu can do asym_packing load balance. False + * otherwise. */ -static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, - struct sg_lb_stats *sgs, - struct sched_group *sg) -{ -#ifdef CONFIG_SCHED_SMT - bool local_is_smt; - int sg_busy_cpus; - - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - - if (!local_is_smt) { - /* - * If we are here, @dst_cpu is idle and does not have SMT - * siblings. Pull tasks if candidate group has two or more - * busy CPUs. - */ - if (sg_busy_cpus >= 2) /* implies sg_is_smt */ - return true; - - /* - * @dst_cpu does not have SMT siblings. @sg may have SMT - * siblings and only one is busy. In such case, @dst_cpu - * can help if it has higher priority and is idle (i.e., - * it has no running tasks). - */ - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - } - - /* - * If we are here @dst_cpu has SMT siblings and are also idle. - * - * CPU priorities does not make sense for SMT cores with more than one - * busy sibling. - */ - if (group->flags & SD_SHARE_CPUCAPACITY && sg_busy_cpus != 1) - return false; - - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - -#else - /* Always return false so that callers deal with non-SMT cases.
*/ - return false; -#endif -} - static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) @@ -9426,10 +9378,14 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs if (!sched_use_asym_prio(env->sd, env->dst_cpu)) return false; - /* Only do SMT checks if either local or candidate have SMT siblings. */ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + /* + * CPU priorities does not make sense for SMT cores with more than one + * busy sibling. + */ + if (group->flags & SD_SHARE_CPUCAPACITY) { + if (sgs->group_weight - sgs->idle_cpus != 1) + return false; + } return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } -- cgit v1.2.3 From 40b4d3dc328265c8ec6688657d74813edf785c83 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 6 Apr 2023 13:31:44 -0700 Subject: sched/topology: Check SDF_SHARED_CHILD in highest_flag_domain() Do not assume that all the children of a scheduling domain have a given flag. Check whether it has the SDF_SHARED_CHILD meta flag. Suggested-by: Ionela Voinescu Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230406203148.19182-9-ricardo.neri-calderon@linux.intel.com --- kernel/sched/sched.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..678446251c35 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1772,6 +1772,13 @@ queue_balance_callback(struct rq *rq, for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | +static const unsigned int SD_SHARED_CHILD_MASK = +#include +0; +#undef SD_FLAG + /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1779,16 +1786,25 @@ queue_balance_callback(struct rq *rq, * @flag: The flag to check for the highest sched_domain * for the given CPU. * - * Returns the highest sched_domain of a CPU which contains the given flag. + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { struct sched_domain *sd, *hsd = NULL; for_each_domain(cpu, sd) { - if (!(sd->flags & flag)) + if (sd->flags & flag) { + hsd = sd; + continue; + } + + /* + * Stop the search if @flag is known to be shared at lower + * levels. It will not be found further up. + */ + if (flag & SD_SHARED_CHILD_MASK) break; - hsd = sd; } return hsd; -- cgit v1.2.3 From 519fabc7aaba3f0847cf37d5f9a5740c370eb777 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Mar 2023 17:13:46 -0800 Subject: psi: remove 500ms min window size limitation for triggers Current 500ms min window size for psi triggers limits polling interval to 50ms to prevent polling threads from using too much cpu bandwidth by polling too frequently. However the number of cgroups with triggers is unlimited, so this protection can be defeated by creating multiple cgroups with psi triggers (triggers in each cgroup are served by a single "psimon" kernel thread). 
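(For reference, a psi trigger is registered from userspace by writing a threshold/window pair to a pressure file and polling it for POLLPRI, per the documented psi interface; a hedged sketch, with the cgroup path as an illustrative assumption:)

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* "some 150000 1000000": wake up when tasks are partially
		 * stalled on memory for >150ms within any 1s window.
		 */
		const char trig[] = "some 150000 1000000";
		struct pollfd fds;
		int fd = open("/sys/fs/cgroup/demo/memory.pressure",
			      O_RDWR | O_NONBLOCK);

		if (fd < 0)
			return 1;
		if (write(fd, trig, strlen(trig) + 1) < 0)
			return 1;

		fds.fd = fd;
		fds.events = POLLPRI;
		if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
			printf("psi event triggered\n");

		close(fd);
		return 0;
	}

After this patch, opening such a cgroup pressure file for writing requires CAP_SYS_RESOURCE.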
Instead of limiting min polling period, which also limits the latency of psi events, it's better to limit psi trigger creation to authorized users only, like we do for system-wide psi triggers (/proc/pressure/* files can be written only by processes with CAP_SYS_RESOURCE capability). This also makes access rules for cgroup psi files consistent with system-wide ones. Add a CAP_SYS_RESOURCE capability check for cgroup psi file writers and remove the psi window min size limitation. Suggested-by: Sudarshan Rajagopalan Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Acked-by: Johannes Weiner Link: https://lore.kernel.org/all/cover.1676067791.git.quic_sudaraja@quicinc.com/ --- kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/sched/psi.c | 4 +--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..b26ae200abef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3877,6 +3877,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } +static int cgroup_pressure_open(struct kernfs_open_file *of) +{ + if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return 0; +} + static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5276,6 +5284,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), + .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5284,6 +5293,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), + .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5292,6 +5302,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), + .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5301,6 +5312,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e072f6b31bf3..b49af594ad28 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -160,7 +160,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ /* PSI trigger definitions */ -#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ @@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us < WINDOW_MIN_US || - window_us > WINDOW_MAX_US) + if (window_us == 0 || window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); /* -- cgit v1.2.3 From bf2dc42d6beb890c995b8b09f881ef1b37259107 Mon Sep 17 00:00:00 2001 From: Tim C Chen Date: Thu, 4 May 2023 09:09:51 -0700 Subject: sched/topology: Propagate SMT flags when removing degenerate domain When 
a degenerate cluster domain for a core with SMT CPUs is removed, the SD_SHARE_CPUCAPACITY flag in the local child sched group was not propagated to the new parent. We need this flag to properly determine whether the local sched group is SMT. Set the flag in the local child sched group of the new parent sched domain. Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ricardo Neri Link: https://lkml.kernel.org/r/73cf0959eafa53c02e7ef6bf805d751d9190e55d.1683156492.git.tim.c.chen@linux.intel.com --- kernel/sched/topology.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6682535e37c8..ca4472281c28 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + if (tmp->flags & SD_SHARE_CPUCAPACITY) + parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this -- cgit v1.2.3 From a6fcdd8d95f7486150b3faadfea119fc3dfc3b74 Mon Sep 17 00:00:00 2001 From: Yan Yan (Cailing) Date: Sat, 6 May 2023 15:42:53 +0800 Subject: sched/debug: Correct printing for rq->nr_uninterruptible Commit e6fe3f422be1 ("sched: Make multiple runqueue task counters 32-bit") changed the type of rq->nr_uninterruptible from "unsigned long" to "unsigned int", but left the wrong cast in the prints to /sys/kernel/debug/sched/debug and to the console. For example, when nr_uninterruptible's value is fffffff7 with type "unsigned int", (long)nr_uninterruptible shows 4294967287 while (int)nr_uninterruptible prints -9. Using an int cast fixes the wrong printing. Signed-off-by: Yan Yan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230506074253.44526-1-yanyan.yan@antgroup.com --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0b2340a79b65..066ff1c8ae4e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ do { \ if (sizeof(rq->x) == 4) \ - SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \ else \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ } while (0) -- cgit v1.2.3 From ad3a557daf6915296a43ef97a3e9c48e076c9dd8 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 May 2023 09:58:49 +0200 Subject: cgroup/cpuset: Rename functions dealing with DEADLINE accounting rebuild_root_domains() and update_tasks_root_domain() have neutral names, but actually deal with DEADLINE bandwidth accounting. Rename them to use a 'dl_' prefix so that the intent is clearer. No functional change.
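The cast behavior corrected by the sched/debug patch above is easy to demonstrate from userspace. Below is a minimal sketch, assuming an LP64 system where long is 64 bits; the variable name merely mirrors the runqueue field:

#include <stdio.h>

int main(void)
{
	/* -9 stored in a 32-bit unsigned counter, as with rq->nr_uninterruptible */
	unsigned int nr_uninterruptible = 0xfffffff7;

	/* old P() behavior: unsigned int -> 64-bit long zero-extends the value */
	printf("(long) cast: %ld\n", (long)nr_uninterruptible);	/* 4294967287 */

	/* fixed P() behavior: reinterpret the same 32 bits as signed */
	printf("(int) cast:  %d\n", (int)nr_uninterruptible);	/* -9 */

	return 0;
}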
Suggested-by: Qais Yousef Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e4ca2dd2b764..428ab46291e2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1066,7 +1066,7 @@ done: return ndoms; } -static void update_tasks_root_domain(struct cpuset *cs) +static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; @@ -1079,7 +1079,7 @@ static void update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void rebuild_root_domains(void) +static void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; @@ -1107,7 +1107,7 @@ static void rebuild_root_domains(void) rcu_read_unlock(); - update_tasks_root_domain(cs); + dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); @@ -1121,7 +1121,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { mutex_lock(&sched_domains_mutex); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - rebuild_root_domains(); + dl_rebuild_rd_accounting(); mutex_unlock(&sched_domains_mutex); } -- cgit v1.2.3 From 111cd11bbc54850f24191c52ff217da88a5e639b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 May 2023 09:58:50 +0200 Subject: sched/cpuset: Bring back cpuset_mutex Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea, as it has been reported to cause slowdowns in workloads that need to change cpuset configuration frequently and it is also not implementing priority inheritance (which causes troubles with realtime workloads). Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it only for SCHED_DEADLINE tasks (other policies don't care about stable cpusets anyway). Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 159 +++++++++++++++++++++++++------------------------ kernel/sched/core.c | 22 ++++--- 2 files changed, 95 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 428ab46291e2..041c0809adaf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -366,22 +366,23 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_rwsem and + * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. The cpuset code uses only cpuset_rwsem write lock. Other - * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to - * prevent change to cpuset structures. + * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems + * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * structures. Note that cpuset_mutex needs to be a mutex as it is used in + * paths that rely on priority inheritance (e.g. scheduler - on RT) for + * correctness. * * A task must hold both locks to modify cpusets. 
If a task holds - * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it - * is the only task able to also acquire callback_lock and be able to - * modify cpusets. It can perform various checks on the cpuset structure - * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_rwsem. While it is performing these checks, various - * callback routines can briefly acquire callback_lock to query cpusets. - * Once it is ready to make the changes, it takes callback_lock, blocking - * everyone else. + * cpuset_mutex, it blocks others, ensuring that it is the only task able to + * also acquire callback_lock and be able to modify cpusets. It can perform + * various checks on the cpuset structure first, knowing nothing will change. + * It can also allocate memory while just holding cpuset_mutex. While it is + * performing these checks, various callback routines can briefly acquire + * callback_lock to query cpusets. Once it is ready to make the changes, it + * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock @@ -403,16 +404,16 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); +static DEFINE_MUTEX(cpuset_mutex); -void cpuset_read_lock(void) +void cpuset_lock(void) { - percpu_down_read(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } -void cpuset_read_unlock(void) +void cpuset_unlock(void) { - percpu_up_read(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } static DEFINE_SPINLOCK(callback_lock); @@ -496,7 +497,7 @@ static inline bool partition_is_populated(struct cpuset *cs, * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -538,7 +539,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -550,7 +551,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_rwsem held. The check can be skipped + * Call with callback_lock or cpuset_mutex held. The check can be skipped * if on default hierarchy. */ static void cpuset_update_task_spread_flags(struct cpuset *cs, @@ -575,7 +576,7 @@ static void cpuset_update_task_spread_flags(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_rwsem. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -713,7 +714,7 @@ out: * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_rwsem held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. 
Operations * such as list traversal that depend on the actual address of the @@ -829,7 +830,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_rwsem held. */ +/* Must be called with cpuset_mutex held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -855,7 +856,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_rwsem held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1084,7 +1085,7 @@ static void dl_rebuild_rd_accounting(void) struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1134,7 +1135,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_rwsem held. Takes cpus_read_lock(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1145,7 +1146,7 @@ static void rebuild_sched_domains_locked(void) int ndoms; lockdep_assert_cpus_held(); - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * If we have raced with CPU hotplug, return early to avoid @@ -1196,9 +1197,9 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } @@ -1208,7 +1209,7 @@ void rebuild_sched_domains(void) * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. @@ -1322,7 +1323,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * The parent must be a partition root. @@ -1545,7 +1546,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. 
* - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool force) @@ -1705,7 +1706,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * Check all its siblings and call update_cpumasks_hier() @@ -1955,12 +1956,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_rwsem */ + static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; @@ -1973,7 +1974,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_rwsem, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -2019,7 +2020,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -2072,7 +2073,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_rwsem held. May take callback_lock during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -2164,7 +2165,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_rwsem held, cpuset membership stays + * function is called with cpuset_mutex held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) @@ -2184,7 +2185,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_rwsem held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -2234,7 +2235,7 @@ out: * @new_prs: new partition root state * Return: 0 if successful, != 0 if error * - * Call with cpuset_rwsem held. + * Call with cpuset_mutex held. 
*/ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2472,7 +2473,7 @@ static int cpuset_can_attach_check(struct cpuset *cs) return 0; } -/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -2484,7 +2485,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); @@ -2506,7 +2507,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) */ cs->attach_in_progress++; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -2518,15 +2519,15 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -2535,7 +2536,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) guarantee_online_cpus(task, cpus_attach); @@ -2565,7 +2566,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); @@ -2626,7 +2627,7 @@ out: if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -2658,7 +2659,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = 0; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2694,7 +2695,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } @@ -2707,7 +2708,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = -ENODEV; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2720,7 +2721,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } @@ -2753,7 +2754,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref 
protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_rwsem anyway. This only happens on the legacy + * grabbing cpuset_mutex anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); @@ -2761,7 +2762,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, flush_work(&cpuset_hotplug_work); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2785,7 +2786,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); @@ -2933,13 +2934,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; @@ -3156,7 +3157,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) return 0; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -3207,7 +3208,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return 0; } @@ -3228,7 +3229,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (is_partition_valid(cs)) update_prstate(cs, 0); @@ -3247,7 +3248,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } @@ -3260,7 +3261,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -3273,7 +3274,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -3294,7 +3295,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) return 0; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); @@ -3315,7 +3316,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) */ cs->attach_in_progress++; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -3331,11 +3332,11 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -3363,7 +3364,7 @@ static void 
cpuset_fork(struct task_struct *task) } /* CLONE_INTO_CGROUP */ - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); @@ -3371,7 +3372,7 @@ static void cpuset_fork(struct task_struct *task) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { @@ -3472,7 +3473,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* * Move tasks to the nearest ancestor with execution resources, @@ -3482,7 +3483,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } static void @@ -3533,14 +3534,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); goto retry; } @@ -3637,7 +3638,7 @@ update_tasks: cpus_updated, mems_updated); unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /** @@ -3667,7 +3668,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3724,7 +3725,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { @@ -4155,7 +4156,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc//cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_rwsem, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 944c3ae39861..d826bec1c522 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7590,6 +7590,7 @@ static int __sched_setscheduler(struct task_struct *p, int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; + bool cpuset_locked = false; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -7639,8 +7640,14 @@ recheck: return retval; } - if (pi) - cpuset_read_lock(); + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. 
+ */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } /* * Make sure no PI-waiters arrive (or leave) while we are @@ -7716,8 +7723,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); goto recheck; } @@ -7784,7 +7791,8 @@ change: task_rq_unlock(rq, p, &rf); if (pi) { - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); rt_mutex_adjust_pi(p); } @@ -7796,8 +7804,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); return retval; } -- cgit v1.2.3 From 6c24849f5515e4966d94fa5279bdff4acf2e9489 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 May 2023 09:58:51 +0200 Subject: sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets Qais reported that iterating over all tasks when rebuilding root domains for finding out which ones are DEADLINE and need their bandwidth correctly restored on such root domains can be a costly operation (10+ ms delays on suspend-resume). To fix the problem keep track of the number of DEADLINE tasks belonging to each cpuset and then use this information (followup patch) to only perform the above iteration if DEADLINE tasks are actually present in the cpuset for which a corresponding root domain is being rebuilt. Reported-by: Qais Yousef Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++++ kernel/cgroup/cpuset.c | 25 +++++++++++++++++++++++++ kernel/sched/deadline.c | 14 ++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..9d809191a54f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -6683,6 +6684,9 @@ void cgroup_exit(struct task_struct *tsk) list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + if (dl_task(tsk)) + dec_dl_tasks_cs(tsk); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 041c0809adaf..ca195ff8b298 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -193,6 +193,12 @@ struct cpuset { int use_parent_ecpus; int child_ecpus_count; + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. 
+ */ + int nr_deadline_tasks; + /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; @@ -245,6 +251,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } +void inc_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks++; +} + +void dec_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks--; +} + /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, @@ -2499,6 +2519,11 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) ret = security_task_setscheduler(task); if (ret) goto out_unlock; + + if (dl_task(task)) { + cs->nr_deadline_tasks++; + cpuset_attach_old_cs->nr_deadline_tasks--; + } } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..e11de074a6fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,8 @@ * Fabio Checconi */ +#include + /* * Default limits for DL period; on the top end we guard against small util * tasks still getting ridiculously long effective runtimes, on the bottom end we @@ -2596,6 +2598,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); + if (!task_on_rq_queued(p)) { /* * Inactive timer is armed. However, p is leaving DEADLINE and @@ -2636,6 +2644,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); + /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { add_rq_bw(&p->dl, &rq->dl); -- cgit v1.2.3 From c0f78fd5edcf29b2822ac165f9248a6c165e8554 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 May 2023 09:58:52 +0200 Subject: cgroup/cpuset: Iterate only if DEADLINE tasks are present update_tasks_root_domain currently iterates over all tasks even if no DEADLINE task is present on the cpuset/root domain for which bandwidth accounting is being rebuilt. This has been reported to introduce 10+ ms delays on suspend-resume operations. Skip the costly iteration for cpusets that don't contain DEADLINE tasks. 
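For reference, the nr_deadline_tasks counter consulted here is driven by tasks entering and leaving SCHED_DEADLINE (switched_to_dl()/switched_from_dl() in the previous patch) and by cgroup migrations. From userspace, a task enters SCHED_DEADLINE via sched_setattr(2); a minimal sketch follows, where the 10ms/30ms reservation values are illustrative only and root privileges are assumed (glibc provides no wrapper, hence the raw syscall):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

/* layout as documented in sched_setattr(2) */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10ULL * 1000 * 1000;	/* 10 ms of runtime... */
	attr.sched_deadline = 30ULL * 1000 * 1000;	/* ...within a 30 ms window */
	attr.sched_period   = 30ULL * 1000 * 1000;

	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	/* this task now counts toward its cpuset's nr_deadline_tasks */
	pause();
	return 0;
}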
Reported-by: Qais Yousef Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca195ff8b298..b7168970fff2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1092,6 +1092,9 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; + if (cs->nr_deadline_tasks == 0) + return; + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) -- cgit v1.2.3 From 85989106feb734437e2d598b639991b9185a43a6 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 8 May 2023 09:58:53 +0200 Subject: sched/deadline: Create DL BW alloc, free & check overflow interface While moving a set of tasks between exclusive cpusets, cpuset_can_attach() -> task_can_attach() calls dl_cpu_busy(..., p) for DL BW overflow checking and per-task DL BW allocation on the destination root_domain for the DL tasks in this set. This approach has the issue of not freeing already allocated DL BW in the following error cases: (1) The set of tasks includes multiple DL tasks and DL BW overflow checking fails for one of the subsequent DL tasks. (2) Another controller next to the cpuset controller which is attached to the same cgroup fails in its can_attach(). To address this problem rework dl_cpu_busy(): (1) Split it into dl_bw_check_overflow() & dl_bw_alloc() and add a dedicated dl_bw_free(). (2) dl_bw_alloc() & dl_bw_free() take a `u64 dl_bw` parameter instead of the `struct task_struct *p` used in dl_cpu_busy(). This makes it possible to allocate DL BW for a whole set of tasks rather than only for a single task. Signed-off-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Tejun Heo --- kernel/sched/core.c | 4 ++-- kernel/sched/deadline.c | 53 +++++++++++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 2 +- 3 files changed, 43 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d826bec1c522..df659892d7d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9319,7 +9319,7 @@ int task_can_attach(struct task_struct *p, if (unlikely(cpu >= nr_cpu_ids)) return -EINVAL; - ret = dl_cpu_busy(cpu, p); + ret = dl_bw_alloc(cpu, p->dl.dl_bw); } out: @@ -9604,7 +9604,7 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_cpu_busy(cpu, NULL); + int ret = dl_bw_check_overflow(cpu); if (ret) return ret; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e11de074a6fd..166c3e6eae61 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3058,26 +3058,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int dl_cpu_busy(int cpu, struct task_struct *p) +enum dl_bw_request { + dl_bw_req_check_overflow = 0, + dl_bw_req_alloc, + dl_bw_req_free +}; + +static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags, cap; + unsigned long flags; struct dl_bw *dl_b; - bool overflow; + bool overflow = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cap = dl_bw_capacity(cpu); - overflow = __dl_overflow(dl_b, cap, 0, p ?
p->dl.dl_bw : 0); - if (!overflow && p) { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + if (req == dl_bw_req_free) { + __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); + } else { + unsigned long cap = dl_bw_capacity(cpu); + + overflow = __dl_overflow(dl_b, cap, 0, dl_bw); + + if (req == dl_bw_req_alloc && !overflow) { + /* + * We reserve space in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); + } } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3085,6 +3097,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p) return overflow ? -EBUSY : 0; } + +int dl_bw_check_overflow(int cpu) +{ + return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); +} + +int dl_bw_alloc(int cpu, u64 dl_bw) +{ + return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); +} + +void dl_bw_free(int cpu, u64 dl_bw) +{ + dl_bw_manage(dl_bw_req_free, cpu, dl_bw); +} #endif #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..0ad712811e35 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -330,7 +330,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_cpu_busy(int cpu, struct task_struct *p); +extern int dl_bw_check_overflow(int cpu); #ifdef CONFIG_CGROUP_SCHED -- cgit v1.2.3 From 2ef269ef1ac006acf974793d975539244d77b28f Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 8 May 2023 09:58:54 +0200 Subject: cgroup/cpuset: Free DL BW in case can_attach() fails cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks have been checked. DL BW is not allocated per-task but as a sum over all DL tasks migrating. If multiple controllers are attached to the cgroup next to the cpuset controller a non-cpuset can_attach() can fail. In this case free DL BW in cpuset_cancel_attach(). Finally, update cpuset DL task count (nr_deadline_tasks) only in cpuset_attach(). Suggested-by: Waiman Long Signed-off-by: Dietmar Eggemann Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched/core.c | 17 ++-------------- 2 files changed, 50 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b7168970fff2..2c76fcd9f0bc 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -198,6 +198,8 @@ struct cpuset { * know when to rebuild associated root domain bandwidth information. 
*/ int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; @@ -2496,16 +2498,23 @@ static int cpuset_can_attach_check(struct cpuset *cs) return 0; } +static void reset_migrate_dl_data(struct cpuset *cs) +{ + cs->nr_migrate_dl_tasks = 0; + cs->sum_migrate_dl_bw = 0; +} + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct cpuset *cs; + struct cpuset *cs, *oldcs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -2516,7 +2525,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->effective_cpus); + ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); @@ -2524,11 +2533,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; if (dl_task(task)) { - cs->nr_deadline_tasks++; - cpuset_attach_old_cs->nr_deadline_tasks--; + cs->nr_migrate_dl_tasks++; + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } + if (!cs->nr_migrate_dl_tasks) + goto out_success; + + if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + + if (unlikely(cpu >= nr_cpu_ids)) { + reset_migrate_dl_data(cs); + ret = -EINVAL; + goto out_unlock; + } + + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) { + reset_migrate_dl_data(cs); + goto out_unlock; + } + } + +out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. @@ -2551,6 +2580,14 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); + + if (cs->nr_migrate_dl_tasks) { + int cpu = cpumask_any(cs->effective_cpus); + + dl_bw_free(cpu, cs->sum_migrate_dl_bw); + reset_migrate_dl_data(cs); + } + mutex_unlock(&cpuset_mutex); } @@ -2651,6 +2688,12 @@ static void cpuset_attach(struct cgroup_taskset *tset) out: cs->old_mems_allowed = cpuset_attach_nodemask_to; + if (cs->nr_migrate_dl_tasks) { + cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; + reset_migrate_dl_data(cs); + } + cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); @@ -3330,7 +3373,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) if (ret) goto out_unlock; - ret = task_can_attach(task, cs->effective_cpus); + ret = task_can_attach(task); if (ret) goto out_unlock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df659892d7d5..ed0d7381b2ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9294,8 +9294,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_effective_cpus) +int task_can_attach(struct task_struct *p) { int ret = 0; @@ -9308,21 +9307,9 @@ int task_can_attach(struct task_struct *p, * success of set_cpus_allowed_ptr() on all attached tasks * before cpus_mask may be changed. 
*/ - if (p->flags & PF_NO_SETAFFINITY) { + if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; - goto out; - } - - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); - if (unlikely(cpu >= nr_cpu_ids)) - return -EINVAL; - ret = dl_bw_alloc(cpu, p->dl.dl_bw); - } - -out: return ret; } -- cgit v1.2.3 From 854f5cc5b7355ceebf2bdfed97ea8f3c5d47a0c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2023 16:47:07 -0700 Subject: Further upgrade queue_work_on() comment The current queue_work_on() docbook comment says that the caller must ensure that the specified CPU can't go away, and further says that the penalty for failing to nail down the specified CPU is that the workqueue handler might find itself executing on some other CPU. This is true as far as it goes, but fails to note what happens if the specified CPU never was online. Therefore, further expand this comment to say that specifying a CPU that was never online will result in a splat. Signed-off-by: Paul E. McKenney Cc: Lai Jiangshan Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4666a1a92a31..36bccc1285b3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1539,6 +1539,8 @@ out: * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. + * But note well that callers specifying a CPU that never has been + * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ -- cgit v1.2.3 From d15121be7485655129101f3960ae6add40204463 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 May 2023 08:17:44 +0200 Subject: Revert "softirq: Let ksoftirqd do its job" This reverts the following commits: 4cd13c21b207 ("softirq: Let ksoftirqd do its job") 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") 1342d8080f61 ("softirq: Don't skip softirq execution when softirq thread is parking") in a single change to avoid known bad intermediate states introduced by a patch series reverting them individually. Due to the mentioned commit, when the ksoftirqd threads take charge of softirq processing, the system can experience high latencies. In the past a few workarounds have been implemented for specific side-effects of the initial ksoftirqd enforcement commit: commit 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred") commit 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run") commit 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") commit 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") But the latency problem still exists in real-life workloads, see the link below. The reverted commit intended to solve a live-lock scenario that can now be addressed with the NAPI threaded mode, introduced with commit 29863d41bb6e ("net: implement threaded-able napi poll loop support"), which is nowadays in a pretty stable status. While a complete solution to put softirq processing under nice resource control would be preferable, that has proven to be a very hard task. In the short term, remove the main pain point, and also simplify a bit the current softirq implementation. 
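For deployments that relied on the reverted behavior to tame softirq livelocks under heavy network load, the replacement mentioned above is per-device threaded NAPI. On kernels that include commit 29863d41bb6e (v5.12 and later) it is toggled through the sysfs attribute /sys/class/net/<dev>/threaded, equivalent to echo 1 > /sys/class/net/eth0/threaded from a shell. A minimal C sketch, with the interface name eth0 being an assumption to adjust for the target system:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* assumed interface name; the "threaded" attribute exists on v5.12+ */
	const char *path = "/sys/class/net/eth0/threaded";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}

	/* "1" moves this device's NAPI polling from softirq into a kthread */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}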
Signed-off-by: Paolo Abeni Signed-off-by: Thomas Gleixner Tested-by: Jason Xing Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Sebastian Andrzej Siewior Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/netdev/305d7742212cbe98621b16be782b0562f1012cb6.camel@redhat.com Link: https://lore.kernel.org/r/57e66b364f1b6f09c9bc0316742c3b14f4ce83bd.1683526542.git.pabeni@redhat.com --- kernel/softirq.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1b725510dd0f..807b34ccd797 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -80,21 +80,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); -} - #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_PER_CPU(int, hardirqs_enabled); DEFINE_PER_CPU(int, hardirq_context); @@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) goto out; pending = local_softirq_pending(); - if (!pending || ksoftirqd_running(pending)) + if (!pending) goto out; /* @@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); -- cgit v1.2.3 From cdfa0f6fa6b7183c062046043b649b9a91e3ac52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Apr 2023 16:49:14 -0700 Subject: rcu/kvfree: Add debug to check grace periods This commit adds debugging checks to verify that the required RCU grace period has elapsed for each kvfree_rcu_bulk_data structure that arrives at the kvfree_rcu_bulk() function. These checks make use of that structure's ->gp_snap field, which has been upgraded from an unsigned long to an rcu_gp_oldstate structure. This upgrade reduces the chances of false positives to nearly zero, even on 32-bit systems, for which this structure carries 64 bits of state. Cc: Ziwei Dai Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..91d75fd6c579 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2756,7 +2756,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2921,23 +2921,24 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, int i; debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). 
- trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } } + rcu_lock_release(&rcu_callback_map); } - rcu_lock_release(&rcu_callback_map); raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bnode)) @@ -3081,7 +3082,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) INIT_LIST_HEAD(&bulk_ready[i]); list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu(bnode->gp_snap)) + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) break; atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); @@ -3285,7 +3286,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // Finally insert and update the GP for this page. bnode->records[bnode->nr_records++] = ptr; - bnode->gp_snap = get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); return true; -- cgit v1.2.3 From f32276a37652a9ce05db27cdfb40ac3e3fc98f9f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 4 Apr 2023 16:13:00 +0200 Subject: rcu/kvfree: Add debug check for GP complete for kfree_rcu_cpu list Under low-memory conditions, kvfree_rcu() will use each object's rcu_head structure to queue objects in a singly linked list headed by the kfree_rcu_cpu structure's ->head field. This list is passed to call_rcu() as a unit, but there is no indication of which grace period this list needs to wait for. This in turn prevents adding debug checks in the kfree_rcu_work() as was done for the two page-of-pointers channels in the kfree_rcu_cpu structure. This commit therefore adds a ->head_free_gp_snap field to the kfree_rcu_cpu_work structure to record this grace-period number. It also adds a WARN_ON_ONCE() to kfree_rcu_monitor() that checks to make sure that the required grace period has in fact elapsed. [ paulmck: Fix kerneldoc issue raised by Stephen Rothwell. ] Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 91d75fd6c579..7452ba97ba34 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2773,6 +2773,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. 
* @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2780,6 +2781,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2985,6 +2987,7 @@ static void kfree_rcu_work(struct work_struct *work) struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; int i; krwp = container_of(to_rcu_work(work), @@ -2999,6 +3002,7 @@ static void kfree_rcu_work(struct work_struct *work) // Channel 3. head = krwp->head_free; krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; raw_spin_unlock_irqrestore(&krcp->lock, flags); // Handle the first two channels. @@ -3015,7 +3019,8 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - kvfree_rcu_list(head); + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); } static bool @@ -3147,6 +3152,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); atomic_set(&krcp->head_count, 0); WRITE_ONCE(krcp->head, NULL); } -- cgit v1.2.3 From 1e237994d9c9a5ae47ae13030585a413a29469e6 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 5 Apr 2023 10:13:59 +0800 Subject: rcu/kvfree: Invoke debug_rcu_bhead_unqueue() after checking bnode->gp_snap If kvfree_rcu_bulk() sees that the required grace period has failed to elapse, it leaks the memory because readers might still be using it. But in that case, the debug-objects subsystem still marks the relevant structures as having been freed, even though they are instead being leaked. This commit fixes this mismatch by invoking debug_rcu_bhead_unqueue() only when we are actually going to free the objects. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7452ba97ba34..426f1f3bb5f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2922,8 +2922,8 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); rcu_lock_acquire(&rcu_callback_map); if (idx == 0) { // kmalloc() / kfree(). trace_rcu_invoke_kfree_bulk_callback( -- cgit v1.2.3 From 309a4316507767f8078d30c9681dc76f4299b0f1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 8 Apr 2023 22:25:30 +0800 Subject: rcu/kvfree: Use consistent krcp when growing kfree_rcu() page cache The add_ptr_to_bulk_krc_lock() function is invoked to allocate a new kfree_rcu() page, also known as a kvfree_rcu_bulk_data structure. The kfree_rcu_cpu structure's lock is used to protect this operation, except that this lock must be momentarily dropped when allocating memory. It is clearly important that the lock that is reacquired be the same lock that was acquired initially via krc_this_cpu_lock(). 
Unfortunately, this same krc_this_cpu_lock() function is used to re-acquire this lock, and if the task migrated to some other CPU during the memory allocation, this will result in the kvfree_rcu_bulk_data structure being added to the wrong CPU's kfree_rcu_cpu structure. This commit therefore replaces that second call to krc_this_cpu_lock() with raw_spin_lock_irqsave() in order to explicitly acquire the lock on the correct kfree_rcu_cpu structure, thus keeping things straight even when the task migrates. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 426f1f3bb5f2..51d84eabf645 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3279,7 +3279,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // scenarios. bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); } if (!bnode) -- cgit v1.2.3 From 021a5ff8474379cd6c23e9b0e97aa27e5ff66a8b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 11 Apr 2023 15:13:41 +0200 Subject: rcu/kvfree: Do not run a page work if a cache is disabled By default the cache size is 5 pages per CPU, but it can be disabled at boot time by setting rcu_min_cached_objs to zero. When that happens, the current code will uselessly set an hrtimer to schedule refilling this cache with zero pages. This commit therefore streamlines this process by simply refusing to set the hrtimer when rcu_min_cached_objs is zero. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d84eabf645..18f592bf6dc6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3225,6 +3225,10 @@ static void fill_page_cache_func(struct work_struct *work) static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { -- cgit v1.2.3 From 60888b77a06ea16665e4df980bb86b418253e268 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 12 Apr 2023 22:31:27 +0800 Subject: rcu/kvfree: Make fill page cache start from krcp->nr_bkv_objs When the fill_page_cache_func() function is invoked, it assumes that the cache of pages is completely empty. However, there can be some time between triggering execution of this function and its actual invocation. During this time, kfree_rcu_work() might run, and might fill in part or all of this cache of pages, thus invalidating the fill_page_cache_func() function's assumption. This will not overfill the cache because put_cached_bnode() will reject the extra page. However, it will result in a needless allocation and freeing of one extra page, which might not be helpful under lowish-memory conditions. This commit therefore causes fill_page_cache_func() to explicitly account for pages that have been placed into the cache shortly before it starts running. Signed-off-by: Zqiang Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 18f592bf6dc6..98f2e833e217 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3201,7 +3201,7 @@ static void fill_page_cache_func(struct work_struct *work) nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? 1 : rcu_min_cached_objs; - for (i = 0; i < nr_pages; i++) { + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); -- cgit v1.2.3 From 6b706e5603c44ff0b6f43c2e26e0d590e1d265f8 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 18 Apr 2023 20:27:02 +0800 Subject: rcu/kvfree: Make drain_page_cache() take early return if cache is disabled If the rcutree.rcu_min_cached_objs kernel boot parameter is set to zero, then krcp->page_cache_work will never be triggered to fill the page cache. In addition, put_cached_bnode() will not fill the page cache. As a result krcp->bkvcache will always be empty, so there is no need to acquire krcp->lock to get a page from krcp->bkvcache. This commit therefore makes drain_page_cache() return immediately if rcu_min_cached_objs is zero. Signed-off-by: Zqiang Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 98f2e833e217..00ed45ddc6ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2902,6 +2902,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); -- cgit v1.2.3 From 5c83cedbaaad6dfe290e50658a204556ac5ac683 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Mar 2023 18:02:00 +0200 Subject: rcu/nocb: Protect lazy shrinker against concurrent (de-)offloading The shrinker may run concurrently with callbacks (de-)offloading. As such, calling rcu_nocb_lock() is very dangerous because it does conditional locking. The worst outcome is that rcu_nocb_lock() doesn't lock but rcu_nocb_unlock() eventually unlocks, or the reverse, creating an imbalance. Fix this by protecting against (de-)offloading using the barrier mutex. If the barrier mutex is contended, which should be rare, step aside so as not to trigger a mutex-vs-allocation dependency chain. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2280616f9d5..1a86883902ce 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1336,13 +1336,33 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. + */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) { + /* + * But really don't insist if barrier_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway.
+ */ + return 0; + } + /* Snapshot count of all CPUs */ for_each_possible_cpu(cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int _count = READ_ONCE(rdp->lazy_len); + int _count; + + if (!rcu_rdp_is_offloaded(rdp)) + continue; + + _count = READ_ONCE(rdp->lazy_len); if (_count == 0) continue; + rcu_nocb_lock_irqsave(rdp, flags); WRITE_ONCE(rdp->lazy_len, 0); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1352,6 +1372,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (sc->nr_to_scan <= 0) break; } + + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_STOP; } -- cgit v1.2.3 From 7625926086765123251f765d91fc3a70617d334d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Mar 2023 18:02:01 +0200 Subject: rcu/nocb: Fix shrinker race against callback enqueuer The shrinker resets the lazy callbacks counter in order to trigger the pending lazy queue flush through the rcuog kthread. The counter reset is protected by the ->nocb_lock against concurrent accesses...except for one of them. Here is a list of existing synchronized readers/writer: 1) The first lazy enqueuer (incrementing ->lazy_len to 1) does so under ->nocb_lock and ->nocb_bypass_lock. 2) The further lazy enqueuers (incrementing ->lazy_len above 1) do so under ->nocb_bypass_lock _only_. 3) The lazy flush checks and resets to 0 under ->nocb_lock and ->nocb_bypass_lock. The shrinker protects its ->lazy_len reset against cases 1) and 3) but not against 2). As such, setting ->lazy_len to 0 under the ->nocb_lock may be cancelled right away by an overwrite from an enqueuer, leading rcuog to ignore the flush. To avoid that, use the proper bypass flush API which takes care of all those details. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 1a86883902ce..c321fce2af8e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1364,7 +1364,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(rdp->lazy_len, 0); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; -- cgit v1.2.3 From b96a8b0b5be40f9bc9e45819f14b32ea9cdce73f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Mar 2023 18:02:02 +0200 Subject: rcu/nocb: Recheck lazy callbacks under the ->nocb_lock from shrinker The ->lazy_len is only checked locklessly. Recheck it under the ->nocb_lock to avoid spending unnecessary time on flushing and waking. The ->lazy_len can still increment concurrently (from 1 to infinity) but under the ->nocb_lock we at least know for sure if there are lazy callbacks at all (->lazy_len > 0). Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree_nocb.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index c321fce2af8e..dfa9c10d6727 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1358,12 +1358,20 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (!rcu_rdp_is_offloaded(rdp)) continue; - _count = READ_ONCE(rdp->lazy_len); - - if (_count == 0) + if (!READ_ONCE(rdp->lazy_len)) continue; rcu_nocb_lock_irqsave(rdp, flags); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); -- cgit v1.2.3 From 5fc8cbe4cf0fd34ded8045c385790c3bf04f6785 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Wed, 3 Aug 2022 01:22:05 +0900 Subject: rcu-tasks: Avoid pr_info() with spin lock in cblist_init_generic() pr_info() is called with rtp->cbs_gbl_lock spin lock locked. Because pr_info() calls printk() that might sleep, this will result in BUG like below: [ 0.206455] cblist_init_generic: Setting adjustable number of callback queues. [ 0.206463] [ 0.206464] ============================= [ 0.206464] [ BUG: Invalid wait context ] [ 0.206465] 5.19.0-00428-g9de1f9c8ca51 #5 Not tainted [ 0.206466] ----------------------------- [ 0.206466] swapper/0/1 is trying to lock: [ 0.206467] ffffffffa0167a58 (&port_lock_key){....}-{3:3}, at: serial8250_console_write+0x327/0x4a0 [ 0.206473] other info that might help us debug this: [ 0.206473] context-{5:5} [ 0.206474] 3 locks held by swapper/0/1: [ 0.206474] #0: ffffffff9eb597e0 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic.constprop.0+0x14/0x1f0 [ 0.206478] #1: ffffffff9eb579c0 (console_lock){+.+.}-{0:0}, at: _printk+0x63/0x7e [ 0.206482] #2: ffffffff9ea77780 (console_owner){....}-{0:0}, at: console_emit_next_record.constprop.0+0x111/0x330 [ 0.206485] stack backtrace: [ 0.206486] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.19.0-00428-g9de1f9c8ca51 #5 [ 0.206488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 [ 0.206489] Call Trace: [ 0.206490] [ 0.206491] dump_stack_lvl+0x6a/0x9f [ 0.206493] __lock_acquire.cold+0x2d7/0x2fe [ 0.206496] ? stack_trace_save+0x46/0x70 [ 0.206497] lock_acquire+0xd1/0x2f0 [ 0.206499] ? serial8250_console_write+0x327/0x4a0 [ 0.206500] ? __lock_acquire+0x5c7/0x2720 [ 0.206502] _raw_spin_lock_irqsave+0x3d/0x90 [ 0.206504] ? serial8250_console_write+0x327/0x4a0 [ 0.206506] serial8250_console_write+0x327/0x4a0 [ 0.206508] console_emit_next_record.constprop.0+0x180/0x330 [ 0.206511] console_unlock+0xf7/0x1f0 [ 0.206512] vprintk_emit+0xf7/0x330 [ 0.206514] _printk+0x63/0x7e [ 0.206516] cblist_init_generic.constprop.0.cold+0x24/0x32 [ 0.206518] rcu_init_tasks_generic+0x5/0xd9 [ 0.206522] kernel_init_freeable+0x15b/0x2a2 [ 0.206523] ? rest_init+0x160/0x160 [ 0.206526] kernel_init+0x11/0x120 [ 0.206527] ret_from_fork+0x1f/0x30 [ 0.206530] [ 0.207018] cblist_init_generic: Setting shift to 1 and lim to 1. This patch moves pr_info() so that it is called without rtp->cbs_gbl_lock locked. Signed-off-by: Shigeru Yoshida Tested-by: "Zhang, Qiang1" Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f4fc8184dd0..65df1aaf0ce9 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } @@ -272,6 +271,10 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + + if (rcu_task_cb_adjust) + pr_info("%s: Setting adjustable number of callback queues.\n", __func__); + pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); } -- cgit v1.2.3 From edff5e9a99e0ed9463999455b2604c3154eb7ab3 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 23 Mar 2023 12:00:11 +0800 Subject: rcu-tasks: Clarify the cblist_init_generic() function's pr_info() output This commit uses rtp->name instead of __func__ and outputs the value of rcu_task_cb_adjust, thus reducing console-log output. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 65df1aaf0ce9..cf7b00af9474 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -272,10 +272,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - if (rcu_task_cb_adjust) - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); - - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, + data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). -- cgit v1.2.3 From fea1c1f0101783f24d00e065ecd3d6e90292f887 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Mar 2023 16:43:54 -0700 Subject: rcu: Check callback-invocation time limit for rcuc kthreads Currently, a callback-invocation time limit is enforced only for callbacks invoked from the softirq environment, the rationale being that when callbacks are instead invoked from rcuc and rcuoc kthreads, these callbacks cannot be holding up other softirq vectors. Which is in fact true. However, if an rcuc kthread spends too much time invoking callbacks, it can delay quiescent-state reports from its CPU, which can also be a problem. This commit therefore applies the callback-invocation time limit to callback invocation from the rcuc kthreads as well as from softirq. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..9a5c160186d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2046,6 +2046,13 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } +/* Return true if callback-invocation time limit exceeded. 
*/ +static bool rcu_do_batch_check_time(long count, long tlimit) +{ + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && !likely(count & 31) && local_clock() >= tlimit; +} + /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. @@ -2082,7 +2089,8 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + unlikely(bl > 100)) { long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; @@ -2126,21 +2134,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit)) break; - } } else { - // In rcuoc context, so no worries about depriving - // other softirq vectors of CPU cycles. + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } -- cgit v1.2.3 From f51164a808b5bf1d81fc37eb53ab1eae59c79f2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Mar 2023 09:05:56 -0700 Subject: rcu: Employ jiffies-based backstop to callback time limit Currently, if there are more than 100 ready-to-invoke RCU callbacks queued on a given CPU, the rcu_do_batch() function sets a timeout for invocation of the series. This timeout defaults to three milliseconds, and may be adjusted using the rcutree.rcu_resched_ns kernel boot parameter. This timeout is checked using local_clock(), but the overhead of this function combined with the common-case very small callback-invocation overhead means that local_clock() is checked every 32nd invocation. This works well except for longer-than-average callbacks. For example, a series of 500-microsecond-duration callbacks means that local_clock() is checked only once every 16 milliseconds, which makes it difficult to enforce a three-millisecond timeout. This commit therefore adds a Kconfig option RCU_DOUBLE_CHECK_CB_TIME that enables backup timeout checking using the coarser-grained but lighter-weight jiffies. If the jiffies counter detects a timeout, then local_clock() is consulted even if this is not the 32nd callback. This prevents the aforementioned 16-millisecond latency blowup. Reported-by: Domas Mituzas Signed-off-by: Paul E.
McKenney --- kernel/rcu/Kconfig | 18 ++++++++++++++++++ kernel/rcu/tree.c | 28 ++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9071182b1284..bdd7eadb33d8 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,4 +314,22 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. + Say N here if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5c160186d1..e2dbea6cee4b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2047,10 +2047,15 @@ rcu_check_quiescent_state(struct rcu_data *rdp) } /* Return true if callback-invocation time limit exceeded. */ -static bool rcu_do_batch_check_time(long count, long tlimit) +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) { // Invoke local_clock() only once per 32 consecutive callbacks. - return unlikely(tlimit) && !likely(count & 31) && local_clock() >= tlimit; + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; } /* @@ -2059,13 +2064,17 @@ static bool rcu_do_batch_check_time(long count, long tlimit) */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2090,11 +2099,14 @@ static void rcu_do_batch(struct rcu_data *rdp) div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && - unlikely(bl > 100)) { + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2134,7 +2146,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. 
*/ - if (rcu_do_batch_check_time(count, tlimit)) + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; } else { // In rcuc/rcuoc context, so no worries about @@ -2147,7 +2159,7 @@ static void rcu_do_batch(struct rcu_data *rdp) // But rcuc kthreads can delay quiescent-state // reporting, so check time limits for them. if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && - rcu_do_batch_check_time(count, tlimit)) { + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { rdp->rcu_cpu_has_work = 1; break; } -- cgit v1.2.3 From 9146eb25495ea8bfb5010192e61e3ed5805ce9ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Apr 2023 16:05:38 -0700 Subject: rcu: Mark additional concurrent load from ->cpu_no_qs.b.exp The per-CPU rcu_data structure's ->cpu_no_qs.b.exp field is updated only on the instance corresponding to the current CPU, but can be read more widely. Unmarked accesses are OK from the corresponding CPU, but only if interrupts are disabled, given that interrupt handlers can and do modify this field. Unfortunately, although the load from rcu_preempt_deferred_qs() is always carried out from the corresponding CPU, interrupts are not necessarily disabled. This commit therefore upgrades this load to READ_ONCE. Similarly, the diagnostic access from synchronize_rcu_expedited_wait() might run with interrupts disabled and from some other CPU. This commit therefore marks this load with data_race(). Finally, the C-language access in rcu_preempt_ctxt_queue() is OK as is because interrupts are disabled and this load is always from the corresponding CPU. This commit adds a comment giving the rationale for this access being safe. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b7abb58157d..8239b39d945b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -643,7 +643,7 @@ static void synchronize_rcu_expedited_wait(void) "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!(rdp->cpu_no_qs.b.exp)]); + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b0fe741a088..41021080ad25 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -257,6 +257,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * GP should not be able to end until we report, so there should be * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) + * + * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. */ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); @@ -941,7 +943,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (rdp->cpu_no_qs.b.exp) + if (READ_ONCE(rdp->cpu_no_qs.b.exp)) rcu_report_exp_rdp(rdp); } -- cgit v1.2.3 From a24c1aab652ebacf9ea62470a166514174c96fe1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 7 Apr 2023 16:47:34 -0700 Subject: rcu: Mark rcu_cpu_kthread() accesses to ->rcu_cpu_has_work The rcu_data structure's ->rcu_cpu_has_work field can be modified by any CPU attempting to wake up the rcuc kthread. Therefore, this commit marks accesses to this field from the rcu_cpu_kthread() function. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2dbea6cee4b..faa1c01d2917 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2481,12 +2481,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; -- cgit v1.2.3 From 15d44dfa40305da1648de4bf001e91cc63148725 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Apr 2023 10:50:47 -0700 Subject: rcu: Make rcu_cpu_starting() rely on interrupts being disabled Currently, rcu_cpu_starting() is written so that it might be invoked with interrupts enabled. However, it is always called when interrupts are disabled, either by rcu_init(), notify_cpu_starting(), or from a call point prior to the call to notify_cpu_starting(). But why bother requiring that interrupts be disabled? The purpose is to allow the rcu_data structure's ->beenonline flag to be set after all early processing has completed for the incoming CPU, thus allowing this flag to be used to determine when workqueues have been set up for the incoming CPU, while still allowing this flag to be used as a diagnostic within rcu_core(). This commit therefore makes rcu_cpu_starting() rely on interrupts being disabled. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index faa1c01d2917..23685407dcf6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4390,15 +4390,16 @@ int rcutree_offline_cpu(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. */ void rcu_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; bool newcpu; + lockdep_assert_irqs_disabled(); rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu_started) return; @@ -4406,7 +4407,6 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); raw_spin_lock(&rcu_state.barrier_lock); @@ -4425,17 +4425,16 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ /* rcu_report_qs_rnp() *really* wants some flags to restore */ - unsigned long flags2; + unsigned long flags; - local_irq_save(flags2); + local_irq_save(flags); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! 
*/ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit v1.2.3 From 401b0de3ae4fa49d1014c8941e26d9a25f37e7cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Apr 2023 11:11:29 -0700 Subject: rcu-tasks: Stop rcu_tasks_invoke_cbs() from using never-onlined CPUs The rcu_tasks_invoke_cbs() function relies on queue_work_on() to silently fall back to WORK_CPU_UNBOUND when the specified CPU is offline. However, the queue_work_on() function's silent fallback mechanism relies on that CPU having been online at some time in the past. When queue_work_on() is passed a CPU that has never been online, workqueue lockups ensue, which can be bad for your kernel's general health and well-being. This commit therefore checks whether a given CPU has ever been online, and, if not substitutes WORK_CPU_UNBOUND in the subsequent call to queue_work_on(). Why not simply omit the queue_work_on() call entirely? Because this function is flooding callback-invocation notifications to all CPUs, and must deal with possibilities that include a sparse cpu_possible_mask. This commit also moves the setting of the rcu_data structure's ->beenonline field to rcu_cpu_starting(), which executes on the incoming CPU before that CPU has ever enabled interrupts. This ensures that the required workqueues are present. In addition, because the incoming CPU has not yet enabled its interrupts, there cannot yet have been any softirq handlers running on this CPU, which means that the WARN_ON_ONCE(!rdp->beenonline) within the RCU_SOFTIRQ handler cannot have triggered yet. Fixes: d363f833c6d88 ("rcu-tasks: Use workqueues for multiple rcu_tasks_invoke_cbs() invocations") Reported-by: Tejun Heo Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ++++++ kernel/rcu/tasks.h | 7 +++++-- kernel/rcu/tree.c | 12 +++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4a1b9622598b..98c1544cf572 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -642,4 +642,10 @@ void show_rcu_tasks_trace_gp_kthread(void); static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f4fc8184dd0..8f08c087142b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -463,6 +463,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu { int cpu; int cpunext; + int cpuwq; unsigned long flags; int len; struct rcu_head *rhp; @@ -473,11 +474,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpunext = cpu * 2 + 1; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); cpunext++; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? 
cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23685407dcf6..54963f8c244c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4305,7 +4305,6 @@ int rcutree_prepare_cpu(unsigned int cpu) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. */ rdp->gp_seq = READ_ONCE(rnp->gp_seq); rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -4332,6 +4331,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Has the specified (known valid) CPU ever been fully online? + */ +bool rcu_cpu_beenfullyonline(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return smp_load_acquire(&rdp->beenonline); +} + /* * Near the end of the CPU-online process. Pretty much all services * enabled, and the CPU is now very much alive. @@ -4435,6 +4444,7 @@ void rcu_cpu_starting(unsigned int cpu) raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit v1.2.3 From fbde57d2d2995375305917b3c944bc861beb84d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Mar 2023 18:02:03 +0200 Subject: rcu/nocb: Make shrinker iterate only over NOCB CPUs Callbacks can only be queued as lazy on NOCB CPUs, therefore iterating over the NOCB mask is enough for both counting and scanning. Just lock the mostly uncontended barrier mutex on counting as well in order to keep rcu_nocb_mask stable. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index dfa9c10d6727..43229d2b0c44 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) int cpu; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) + return 0; + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); count += READ_ONCE(rdp->lazy_len); } + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_EMPTY; } @@ -1336,6 +1345,8 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; /* * Protect against concurrent (de-)offloading. Otherwise nocb locking * may be ignored or imbalanced. @@ -1351,11 +1362,11 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) } /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int _count; - if (!rcu_rdp_is_offloaded(rdp)) + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) continue; if (!READ_ONCE(rdp->lazy_len)) -- cgit v1.2.3 From f8619c300f49c5831d344d35df93d3af447efc97 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 6 Mar 2023 20:48:13 -0800 Subject: locktorture: Add long_hold to adjust lock-hold delays This commit adds a long_hold module parameter to allow testing diagnostics for excessive lock-hold times. Also adjust torture_param() invocations for longer line length while in the area. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/locking/locktorture.c | 51 +++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 153ddc4c47ef..949d3deae506 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,24 +33,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % @@ -198,16 +193,18 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. 
*/ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + j = jiffies; mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -322,7 +319,7 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. @@ -455,14 +452,12 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -630,7 +625,7 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and @@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -695,14 +690,12 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; } - cxt.cur_ops->write_delay(&rand); if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); -- cgit v1.2.3 From bf5ddd736509a7d9077c0b6793e6f0852214dbea Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 22 Mar 2023 19:42:40 +0800 Subject: rcu/rcuscale: Move rcu_scale_*() after kfree_scale_cleanup() This code-movement-only commit moves the rcu_scale_cleanup() and rcu_scale_shutdown() functions to follow kfree_scale_cleanup(). 
This code movement is in preparation for a bug-fix patch that invokes kfree_scale_cleanup() from rcu_scale_cleanup(). Signed-off-by: Qiuxu Zhuo Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/rcuscale.c | 194 +++++++++++++++++++++++++------------------------- 1 file changed, 97 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index e82ec9f9a5d8..7e8965b0827a 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -522,89 +522,6 @@ rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - /* * Return the number if non-negative. If -1, the number of CPUs. If less than -1, that much less than the number of CPUs, but @@ -624,20 +541,6 @@ static int compute_real(int n) return nr; } -/* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - /* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete.
@@ -874,6 +777,103 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { -- cgit v1.2.3 From 23fc8df26dead16687ae6eb47b0561a4a832e2f6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 22 Mar 2023 19:42:41 +0800 Subject: rcu/rcuscale: Stop kfree_scale_thread thread(s) after unloading rcuscale Running the 'kfree_rcu_test' test case [1] results in a splat [2]. The root cause is that the kfree_scale_thread thread(s) continue running after unloading the rcuscale module. This commit fixes that issue by invoking kfree_scale_cleanup() from rcu_scale_cleanup() when removing the rcuscale module.
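The underlying hazard is generic: a kthread spawned by a module must be stopped before the module is unloaded, or it keeps executing freed module text. A minimal, hypothetical sketch of the required cleanup ordering (illustrative only, not the rcuscale code itself):

	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/module.h>
	#include <linux/sched.h>

	static struct task_struct *worker;

	static int worker_fn(void *unused)
	{
		/* Spin until kthread_stop() is invoked on this thread. */
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static int __init demo_init(void)
	{
		worker = kthread_run(worker_fn, NULL, "demo_worker");
		return PTR_ERR_OR_ZERO(worker);
	}

	static void __exit demo_exit(void)
	{
		/* Without this call, worker_fn() would keep running after
		 * rmmod and eventually fault, much like splat [2] below.
		 */
		kthread_stop(worker);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");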
[1] modprobe rcuscale kfree_rcu_test=1 // After some time rmmod rcuscale rmmod torture [2] BUG: unable to handle page fault for address: ffffffffc0601a87 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD 11de4f067 P4D 11de4f067 PUD 11de51067 PMD 112f4d067 PTE 0 Oops: 0010 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 1798 Comm: kfree_scale_thr Not tainted 6.3.0-rc1-rcu+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:0xffffffffc0601a87 Code: Unable to access opcode bytes at 0xffffffffc0601a5d. RSP: 0018:ffffb25bc2e57e18 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffffffffc061f0b6 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff962fd0de RDI: ffffffff962fd0de RBP: ffffb25bc2e57ea8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 000000000000000a R15: 00000000001c1dbe FS: 0000000000000000(0000) GS:ffff921fa2200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc0601a5d CR3: 000000011de4c006 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? kvfree_call_rcu+0xf0/0x3a0 ? kthread+0xf3/0x120 ? kthread_complete_and_exit+0x20/0x20 ? ret_from_fork+0x1f/0x30 Modules linked in: rfkill sunrpc ... [last unloaded: torture] CR2: ffffffffc0601a87 ---[ end trace 0000000000000000 ]--- Fixes: e6e78b004fa7 ("rcuperf: Add kfree_rcu() performance Tests") Reviewed-by: Davidlohr Bueso Reviewed-by: Joel Fernandes (Google) Signed-off-by: Qiuxu Zhuo Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 7e8965b0827a..d1221731c7cf 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -797,6 +797,11 @@ rcu_scale_cleanup(void) if (gp_exp && gp_async) SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + if (torture_cleanup_begin()) return; if (!cur_ops) { -- cgit v1.2.3 From ee9fd0ac3017c4313be91a220a9ac4c99dde7ad4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 10 May 2023 21:37:48 -0700 Subject: bpf: Address KCSAN report on bpf_lru_list KCSAN reported a data-race when accessing node->ref. Although node->ref does not have to be accurate, take this chance to use a more common READ_ONCE() and WRITE_ONCE() pattern instead of data_race(). There is an existing bpf_lru_node_is_ref() and bpf_lru_node_set_ref(). This patch also adds bpf_lru_node_clear_ref() to do the WRITE_ONCE(node->ref, 0) also. 
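Distilled, the marked-access helpers that the patch below converges on look like this (a restatement of the diff, gathered in one place for clarity; node->ref is only an access-frequency hint, so lost updates are harmless and the markings simply declare the lockless accesses intentional to the compiler and to KCSAN):

	static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
	{
		return READ_ONCE(node->ref);
	}

	static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
	{
		WRITE_ONCE(node->ref, 0);
	}

	static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
	{
		/* Test first so a hot, shared cache line is not dirtied
		 * when the flag is already set.
		 */
		if (!READ_ONCE(node->ref))
			WRITE_ONCE(node->ref, 1);
	}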
================================================================== BUG: KCSAN: data-race in __bpf_lru_list_rotate / __htab_lru_percpu_map_update_elem write to 0xffff888137038deb of 1 bytes by task 11240 on cpu 1: __bpf_lru_node_move kernel/bpf/bpf_lru_list.c:113 [inline] __bpf_lru_list_rotate_active kernel/bpf/bpf_lru_list.c:149 [inline] __bpf_lru_list_rotate+0x1bf/0x750 kernel/bpf/bpf_lru_list.c:240 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:329 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x638/0xe20 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop kernel/bpf/hashtab.c:290 [inline] __htab_lru_percpu_map_update_elem+0xe7/0x820 kernel/bpf/hashtab.c:1316 bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313 bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687 bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534 __sys_bpf+0x338/0x810 __do_sys_bpf kernel/bpf/syscall.c:5096 [inline] __se_sys_bpf kernel/bpf/syscall.c:5094 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888137038deb of 1 bytes by task 11241 on cpu 0: bpf_lru_node_set_ref kernel/bpf/bpf_lru_list.h:70 [inline] __htab_lru_percpu_map_update_elem+0x2f1/0x820 kernel/bpf/hashtab.c:1332 bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313 bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687 bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534 __sys_bpf+0x338/0x810 __do_sys_bpf kernel/bpf/syscall.c:5096 [inline] __se_sys_bpf kernel/bpf/syscall.c:5094 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11241 Comm: syz-executor.3 Not tainted 6.3.0-rc7-syzkaller-00136-g6a66fdd29ea1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/30/2023 ================================================================== Reported-by: syzbot+ebe648a84e8784763f82@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230511043748.1384166-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lru_list.c | 21 +++++++++++++-------- kernel/bpf/bpf_lru_list.h | 7 ++----- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index d99e89f113c4..3dabdd137d10 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { - return node->ref; + return READ_ONCE(node->ref); +} + +static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) +{ + WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, @@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } @@ -110,7 +115,7 @@ 
static void __bpf_lru_node_move(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } - node->ref = 0; + bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. @@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } @@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; - node->ref = 0; + bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } @@ -522,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, } node->type = BPF_LRU_LOCAL_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); @@ -568,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } @@ -594,7 +599,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 4ea227c9c1ad..8f3c8b2b4490 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -64,11 +64,8 @@ struct bpf_lru { static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) { - /* ref is an approximation on access frequency. It does not - * have to be very accurate. Hence, no protection is used. - */ - if (!node->ref) - node->ref = 1; + if (!READ_ONCE(node->ref)) + WRITE_ONCE(node->ref, 1); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, -- cgit v1.2.3 From 29ebbba7d46136cba324264e513a1e964ca16c0a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 May 2023 10:04:53 -0700 Subject: bpf: Don't EFAULT for {g,s}setsockopt with wrong optlen With the way the hooks are implemented right now, we have a special condition: an optval larger than PAGE_SIZE will expose only the first 4k into BPF; any modifications to the optval are ignored. If the BPF program doesn't handle this condition by resetting optlen to 0, userspace will get EFAULT. The intention of the EFAULT was to make it apparent to developers that the program is doing something wrong. However, this might inadvertently affect production workloads with BPF programs that are not too careful (i.e., returning EFAULT for perfectly valid setsockopt/getsockopt calls). Let's try to minimize the chance of a BPF program screwing up userspace by ignoring the output of those BPF programs (instead of returning EFAULT to userspace). Log those cases to dmesg via pr_info_once() to help with figuring out what's going wrong.
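From the program side, the documented way to cope with the large-buffer condition is to zero out optlen; a minimal, hypothetical cgroup getsockopt program sketching that convention (assumes libbpf's bpf_helpers.h and 4096-byte pages):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#define ASSUMED_PAGE_SIZE 4096	/* assumption: 4 KiB pages */

	SEC("cgroup/getsockopt")
	int getsockopt_passthrough(struct bpf_sockopt *ctx)
	{
		/* Buffers larger than a page are only partially mirrored
		 * into the BPF context, so declare "no modification" by
		 * zeroing optlen rather than leaving a truncated buffer.
		 */
		if (ctx->optlen > ASSUMED_PAGE_SIZE)
			ctx->optlen = 0;

		return 1;	/* allow; the kernel's value passes through */
	}

	char LICENSE[] SEC("license") = "GPL";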
Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Suggested-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230511170456.1759459-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/cgroup.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a06e118a9be5..14c870595428 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1826,6 +1826,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, ret = 1; } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ + if (*optlen > PAGE_SIZE && ctx.optlen >= 0) { + pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", + ctx.optlen, max_optlen); + ret = 0; + goto out; + } ret = -EFAULT; } else { /* optlen within bounds, run kernel handler */ @@ -1881,8 +1887,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .optname = optname, .current_task = current, }; + int orig_optlen; int ret; + orig_optlen = max_optlen; ctx.optlen = max_optlen; max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) @@ -1905,6 +1913,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = -EFAULT; goto out; } + orig_optlen = ctx.optlen; if (copy_from_user(ctx.optval, optval, min(ctx.optlen, max_optlen)) != 0) { @@ -1922,6 +1931,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { + if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { + pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", + ctx.optlen, max_optlen); + ret = retval; + goto out; + } ret = -EFAULT; goto out; } -- cgit v1.2.3 From ba831b7b1a517ba7f25d6fa9736a8092d07b0c74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:00 +0200 Subject: cpu/hotplug: Mark arch_disable_smp_support() and bringup_nonboot_cpus() __init No point in keeping them around. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205255.551974164@linutronix.de --- kernel/cpu.c | 2 +- kernel/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f4a2c5845bcb..c0d859c68168 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1502,7 +1502,7 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu) return 0; } -void bringup_nonboot_cpus(unsigned int setup_max_cpus) +void __init bringup_nonboot_cpus(unsigned int setup_max_cpus) { unsigned int cpu; diff --git a/kernel/smp.c b/kernel/smp.c index ab3e5dad6cfe..71dce748dbf0 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -892,7 +892,7 @@ EXPORT_SYMBOL(setup_max_cpus); * SMP mode to . */ -void __weak arch_disable_smp_support(void) { } +void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { -- cgit v1.2.3 From 22b612e2d53f6e13ce7b55ed565a104512f0eb00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:21 +0200 Subject: cpu/hotplug: Rework sparse_irq locking in bringup_cpu() There is no harm to hold sparse_irq lock until the upcoming CPU completes in cpuhp_online_idle(). This allows to remove cpu_online() synchronization from architecture code. 
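Condensed, the reworked bringup_cpu() in the diff that follows keeps the lock across the wait (error handling elided):

  irq_lock_sparse();
  ret = __cpu_up(cpu, idle);                      /* arch-specific kick */
  if (!ret)
          ret = bringup_wait_for_ap_online(cpu);  /* now waits under the lock */
  irq_unlock_sparse();
  if (!ret && st->target > CPUHP_AP_ONLINE_IDLE)
          ret = cpuhp_kick_ap(cpu, st, st->target);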
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205256.263722880@linutronix.de --- kernel/cpu.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c0d859c68168..df8f137f0271 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -558,7 +558,7 @@ static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, return ret; } -static int bringup_wait_for_ap(unsigned int cpu) +static int bringup_wait_for_ap_online(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -579,15 +579,12 @@ static int bringup_wait_for_ap(unsigned int cpu) */ if (!cpu_smt_allowed(cpu)) return -ECANCELED; - - if (st->target <= CPUHP_AP_ONLINE_IDLE) - return 0; - - return cpuhp_kick_ap(cpu, st, st->target); + return 0; } static int bringup_cpu(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle = idle_thread_get(cpu); int ret; @@ -600,16 +597,33 @@ static int bringup_cpu(unsigned int cpu) /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. - * Prevent irq alloc/free across the bringup. + * + * Prevent irq alloc/free across the bringup by acquiring the + * sparse irq lock. Hold it until the upcoming CPU completes the + * startup in cpuhp_online_idle() which allows to avoid + * intermediate synchronization points in the architecture code. */ irq_lock_sparse(); /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); - irq_unlock_sparse(); if (ret) - return ret; - return bringup_wait_for_ap(cpu); + goto out_unlock; + + ret = bringup_wait_for_ap_online(cpu); + if (ret) + goto out_unlock; + + irq_unlock_sparse(); + + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(cpu, st, st->target); + +out_unlock: + irq_unlock_sparse(); + return ret; } static int finish_cpu(unsigned int cpu) -- cgit v1.2.3 From 6f0621238b7e7680d5e26c00aa4cd473314d05b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:27 +0200 Subject: cpu/hotplug: Add CPU state tracking and synchronization The CPU state tracking and synchronization mechanism in smpboot.c is completely independent of the hotplug code and all logic around it is implemented in architecture specific code. Except for the state reporting of the AP there is absolutely nothing architecture specific and the sychronization and decision functions can be moved into the generic hotplug core code. Provide an integrated variant and add the core synchronization and decision points. This comes in two flavours: 1) DEAD state synchronization Updated by the architecture code once the AP reaches the point where it is ready to be torn down by the control CPU, e.g. by removing power or clocks or tear down via the hypervisor. The control CPU waits for this state to be reached with a timeout. If the state is reached an architecture specific cleanup function is invoked. 2) Full state synchronization This extends #1 with AP alive synchronization. This is new functionality, which allows to replace architecture specific wait mechanims, e.g. cpumasks, completely. It also prevents that an AP which is in a limbo state can be brought up again. 
This can happen when an AP failed to report dead state during a previous off-line operation. The dead synchronization is what most architectures use. Only x86 makes a bringup decision based on that state at the moment. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205256.476305035@linutronix.de --- kernel/cpu.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/smpboot.c | 2 + 2 files changed, 194 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index df8f137f0271..64b624291316 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,7 @@ * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation + * @ap_sync_state: State for AP synchronization * @done_up: Signal completion to the issuer of the task for cpu-up * @done_down: Signal completion to the issuer of the task for cpu-down */ @@ -76,6 +78,7 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; + atomic_t ap_sync_state; struct completion done_up; struct completion done_down; #endif @@ -276,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state) return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; } +/* Synchronization state management */ +enum cpuhp_sync_state { + SYNC_STATE_DEAD, + SYNC_STATE_KICKED, + SYNC_STATE_SHOULD_DIE, + SYNC_STATE_ALIVE, + SYNC_STATE_SHOULD_ONLINE, + SYNC_STATE_ONLINE, +}; + +#ifdef CONFIG_HOTPLUG_CORE_SYNC +/** + * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown + * @state: The synchronization state to set + * + * No synchronization point. Just update of the synchronization state, but implies + * a full barrier so that the AP changes are visible before the control CPU proceeds. + */ +static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) +{ + atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); + + (void)atomic_xchg(st, state); +} + +void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); } + +static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state, + enum cpuhp_sync_state next_state) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + ktime_t now, end, start = ktime_get(); + int sync; + + end = start + 10ULL * NSEC_PER_SEC; + + sync = atomic_read(st); + while (1) { + if (sync == state) { + if (!atomic_try_cmpxchg(st, &sync, next_state)) + continue; + return true; + } + + now = ktime_get(); + if (now > end) { + /* Timeout. Leave the state unchanged */ + return false; + } else if (now - start < NSEC_PER_MSEC) { + /* Poll for one millisecond */ + arch_cpuhp_sync_state_poll(); + } else { + usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE); + } + sync = atomic_read(st); + } + return true; +} +#else /* CONFIG_HOTPLUG_CORE_SYNC */ +static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC */ + +#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD +/** + * cpuhp_ap_report_dead - Update synchronization state to DEAD + * + * No synchronization point. Just update of the synchronization state. 
+ */ +void cpuhp_ap_report_dead(void) +{ + cpuhp_ap_update_sync_state(SYNC_STATE_DEAD); +} + +void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { } + +/* + * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down + * because the AP cannot issue complete() at this stage. + */ +static void cpuhp_bp_sync_dead(unsigned int cpu) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + int sync = atomic_read(st); + + do { + /* CPU can have reported dead already. Don't overwrite that! */ + if (sync == SYNC_STATE_DEAD) + break; + } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE)); + + if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) { + /* CPU reached dead state. Invoke the cleanup function */ + arch_cpuhp_cleanup_dead_cpu(cpu); + return; + } + + /* No further action possible. Emit message and give up. */ + pr_err("CPU%u failed to report dead state\n", cpu); +} +#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */ +static inline void cpuhp_bp_sync_dead(unsigned int cpu) { } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */ + +#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL +/** + * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive + * + * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits + * for the BP to release it. + */ +void cpuhp_ap_sync_alive(void) +{ + atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); + + cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE); + + /* Wait for the control CPU to release it. */ + while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE) + cpu_relax(); +} + +static bool cpuhp_can_boot_ap(unsigned int cpu) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + int sync = atomic_read(st); + +again: + switch (sync) { + case SYNC_STATE_DEAD: + /* CPU is properly dead */ + break; + case SYNC_STATE_KICKED: + /* CPU did not come up in previous attempt */ + break; + case SYNC_STATE_ALIVE: + /* CPU is stuck cpuhp_ap_sync_alive(). */ + break; + default: + /* CPU failed to report online or dead and is in limbo state. */ + return false; + } + + /* Prepare for booting */ + if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED)) + goto again; + + return true; +} + +void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { } + +/* + * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up + * because the AP cannot issue complete() so early in the bringup. + */ +static int cpuhp_bp_sync_alive(unsigned int cpu) +{ + int ret = 0; + + if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL)) + return 0; + + if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) { + pr_err("CPU%u failed to report alive state\n", cpu); + ret = -EIO; + } + + /* Let the architecture cleanup the kick alive mechanics. */ + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; +} +#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */ +static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; } +static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */ + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -588,6 +767,9 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + if (!cpuhp_can_boot_ap(cpu)) + return -EAGAIN; + /* * Reset stale stack state from the last time this CPU was online. 
*/ @@ -610,6 +792,10 @@ static int bringup_cpu(unsigned int cpu) if (ret) goto out_unlock; + ret = cpuhp_bp_sync_alive(cpu); + if (ret) + goto out_unlock; + ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; @@ -1113,6 +1299,8 @@ static int takedown_cpu(unsigned int cpu) /* This actually kills the CPU. */ __cpu_die(cpu); + cpuhp_bp_sync_dead(cpu); + tick_cleanup_dead_cpu(cpu); rcutree_migrate_callbacks(cpu); return 0; @@ -1359,8 +1547,10 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE); + /* - * Unpart the stopper thread before we start the idle loop (and start + * Unpark the stopper thread before we start the idle loop (and start * scheduling); this ensures the stopper task is always available. */ stop_machine_unpark(smp_processor_id()); @@ -2737,6 +2927,7 @@ void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); + atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 2c7396da470c..3dcfd3f04ed0 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -326,6 +326,7 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); +#ifndef CONFIG_HOTPLUG_CORE_SYNC static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* @@ -488,3 +489,4 @@ bool cpu_report_death(void) } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CORE_SYNC */ -- cgit v1.2.3 From 5356297d12d9ee6f70d09485878904bc41bac422 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:30 +0200 Subject: cpu/hotplug: Remove cpu_report_state() and related unused cruft No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205256.582584351@linutronix.de --- kernel/smpboot.c | 90 -------------------------------------------------------- 1 file changed, 90 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 3dcfd3f04ed0..1940f33a40a3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -329,97 +329,7 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); #ifndef CONFIG_HOTPLUG_CORE_SYNC static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); -/* - * Called to poll specified CPU's state, for example, when waiting for - * a CPU to come online. - */ -int cpu_report_state(int cpu) -{ - return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); -} - -/* - * If CPU has died properly, set its state to CPU_UP_PREPARE and - * return success. Otherwise, return -EBUSY if the CPU died after - * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN - * if cpu_wait_death() timed out and the CPU still hasn't gotten around - * to dying. In the latter two cases, the CPU might not be set up - * properly, but it is up to the arch-specific code to decide. - * Finally, -EIO indicates an unanticipated problem. - * - * Note that it is permissible to omit this call entirely, as is - * done in architectures that do no CPU-hotplug error checking. 
- */ -int cpu_check_up_prepare(int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); - return 0; - } - - switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { - - case CPU_POST_DEAD: - - /* The CPU died properly, so just start it up again. */ - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); - return 0; - - case CPU_DEAD_FROZEN: - - /* - * Timeout during CPU death, so let caller know. - * The outgoing CPU completed its processing, but after - * cpu_wait_death() timed out and reported the error. The - * caller is free to proceed, in which case the state - * will be reset properly by cpu_set_state_online(). - * Proceeding despite this -EBUSY return makes sense - * for systems where the outgoing CPUs take themselves - * offline, with no post-death manipulation required from - * a surviving CPU. - */ - return -EBUSY; - - case CPU_BROKEN: - - /* - * The most likely reason we got here is that there was - * a timeout during CPU death, and the outgoing CPU never - * did complete its processing. This could happen on - * a virtualized system if the outgoing VCPU gets preempted - * for more than five seconds, and the user attempts to - * immediately online that same CPU. Trying again later - * might return -EBUSY above, hence -EAGAIN. - */ - return -EAGAIN; - - case CPU_UP_PREPARE: - /* - * Timeout while waiting for the CPU to show up. Allow to try - * again later. - */ - return 0; - - default: - - /* Should not happen. Famous last words. */ - return -EIO; - } -} - -/* - * Mark the specified CPU online. - * - * Note that it is permissible to omit this call entirely, as is - * done in architectures that do no CPU-hotplug error checking. - */ -void cpu_set_state_online(int cpu) -{ - (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); -} - #ifdef CONFIG_HOTPLUG_CPU - /* * Wait for the specified CPU to exit the idle loop and die. */ -- cgit v1.2.3 From bc088f9a0d5bdf12bb18980739336dfcc092e55b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:41 +0200 Subject: cpu/hotplug: Remove unused state functions All users converted to the hotplug core mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205256.972894276@linutronix.de --- kernel/smpboot.c | 75 -------------------------------------------------------- 1 file changed, 75 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1940f33a40a3..f47d8f375946 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -325,78 +325,3 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) cpus_read_unlock(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); - -#ifndef CONFIG_HOTPLUG_CORE_SYNC -static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); - -#ifdef CONFIG_HOTPLUG_CPU -/* - * Wait for the specified CPU to exit the idle loop and die. - */ -bool cpu_wait_death(unsigned int cpu, int seconds) -{ - int jf_left = seconds * HZ; - int oldstate; - bool ret = true; - int sleep_jf = 1; - - might_sleep(); - - /* The outgoing CPU will normally get done quite quickly. */ - if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state_early; - udelay(5); - - /* But if the outgoing CPU dawdles, wait increasingly long times. 
*/ - while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { - schedule_timeout_uninterruptible(sleep_jf); - jf_left -= sleep_jf; - if (jf_left <= 0) - break; - sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); - } -update_state_early: - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); -update_state: - if (oldstate == CPU_DEAD) { - /* Outgoing CPU died normally, update state. */ - smp_mb(); /* atomic_read() before update. */ - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); - } else { - /* Outgoing CPU still hasn't died, set state accordingly. */ - if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - &oldstate, CPU_BROKEN)) - goto update_state; - ret = false; - } - return ret; -} - -/* - * Called by the outgoing CPU to report its successful death. Return - * false if this report follows the surviving CPU's timing out. - * - * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU - * timed out. This approach allows architectures to omit calls to - * cpu_check_up_prepare() and cpu_set_state_online() without defeating - * the next cpu_wait_death()'s polling loop. - */ -bool cpu_report_death(void) -{ - int oldstate; - int newstate; - int cpu = smp_processor_id(); - - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); - do { - if (oldstate != CPU_BROKEN) - newstate = CPU_DEAD; - else - newstate = CPU_DEAD_FROZEN; - } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - &oldstate, newstate)); - return newstate == CPU_DEAD; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#endif /* !CONFIG_HOTPLUG_CORE_SYNC */ -- cgit v1.2.3 From 6d712b9b3a58018259fb40ddd498d1f7dfa1f4ec Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 May 2023 23:07:43 +0200 Subject: cpu/hotplug: Reset task stack state in _cpu_up() Commit dce1ca0525bf ("sched/scs: Reset task stack state in bringup_cpu()") ensured that the shadow call stack and KASAN poisoning were removed from a CPU's stack each time that CPU is brought up, not just once. This is not incorrect. However, with parallel bringup the idle thread setup will happen at a different step. As a consequence the cleanup in bringup_cpu() would be too late. Move the SCS/KASAN cleanup to the generic _cpu_up() function instead, which already ensures that the new CPU's stack is available, purely to allow for early failure. This occurs when the CPU to be brought up is in the CPUHP_OFFLINE state, which should correctly do the cleanup any time the CPU has been taken down to the point where such is needed. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Tested-by: Mark Rutland Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205257.027075560@linutronix.de --- kernel/cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 64b624291316..0ab6a7d430c6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -770,12 +770,6 @@ static int bringup_cpu(unsigned int cpu) if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; - /* - * Reset stale stack state from the last time this CPU was online. - */ - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. 
@@ -1587,6 +1581,12 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = PTR_ERR(idle); goto out; } + + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); } cpuhp_tasks_frozen = tasks_frozen; -- cgit v1.2.3 From a631be92b996c5db9b368e8b96305d22fb8c4180 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:45 +0200 Subject: cpu/hotplug: Provide a split up CPUHP_BRINGUP mechanism The bring up logic of a to be onlined CPU consists of several parts, which are considered to be a single hotplug state: 1) Control CPU issues the wake-up 2) To be onlined CPU starts up, does the minimal initialization, reports to be alive and waits for release into the complete bring-up. 3) Control CPU waits for the alive report and releases the upcoming CPU for the complete bring-up. Allow to split this into two states: 1) Control CPU issues the wake-up After that the to be onlined CPU starts up, does the minimal initialization, reports to be alive and waits for release into the full bring-up. As this can run after the control CPU dropped the hotplug locks the code which is executed on the AP before it reports alive has to be carefully audited to not violate any of the hotplug constraints, especially not modifying any of the various cpumasks. This is really only meant to avoid waiting for the AP to react on the wake-up. Of course an architecture can move strict CPU related setup functionality, e.g. microcode loading, with care before the synchronization point to save further pointless waiting time. 2) Control CPU waits for the alive report and releases the upcoming CPU for the complete bring-up. This allows that the two states can be split up to run all to be onlined CPUs up to state #1 on the control CPU and then at a later point run state #2. This spares some of the latencies of the full serialized per CPU bringup by avoiding the per CPU wakeup/wait serialization. The assumption is that the first AP already waits when the last AP has been woken up. This obvioulsy depends on the hardware latencies and depending on the timings this might still not completely eliminate all wait scenarios. This split is just a preparatory step for enabling the parallel bringup later. The boot time bringup is still fully serialized. It has a separate config switch so that architectures which want to support parallel bringup can test the split of the CPUHP_BRINGUG step separately. To enable this the architecture must support the CPU hotplug core sync mechanism and has to be audited that there are no implicit hotplug state dependencies which require a fully serialized bringup. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. 
Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205257.080801387@linutronix.de --- kernel/cpu.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0ab6a7d430c6..d2487aa4e7c8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -761,6 +761,47 @@ static int bringup_wait_for_ap_online(unsigned int cpu) return 0; } +#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP +static int cpuhp_kick_ap_alive(unsigned int cpu) +{ + if (!cpuhp_can_boot_ap(cpu)) + return -EAGAIN; + + return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu)); +} + +static int cpuhp_bringup_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; + + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + + ret = cpuhp_bp_sync_alive(cpu); + if (ret) + goto out_unlock; + + ret = bringup_wait_for_ap_online(cpu); + if (ret) + goto out_unlock; + + irq_unlock_sparse(); + + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(cpu, st, st->target); + +out_unlock: + irq_unlock_sparse(); + return ret; +} +#else static int bringup_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -781,7 +822,6 @@ static int bringup_cpu(unsigned int cpu) */ irq_lock_sparse(); - /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); if (ret) goto out_unlock; @@ -805,6 +845,7 @@ out_unlock: irq_unlock_sparse(); return ret; } +#endif static int finish_cpu(unsigned int cpu) { @@ -1944,13 +1985,38 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, - /* Kicks the plugged cpu into life */ + +#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP + /* + * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until + * the next step will release it. + */ + [CPUHP_BP_KICK_AP] = { + .name = "cpu:kick_ap", + .startup.single = cpuhp_kick_ap_alive, + }, + + /* + * Waits for the AP to reach cpuhp_ap_sync_alive() and then + * releases it for the complete bringup. + */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup.single = cpuhp_bringup_ap, + .teardown.single = finish_cpu, + .cant_stop = true, + }, +#else + /* + * All-in-one CPU bringup state which includes the kick alive. + */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, .teardown.single = finish_cpu, .cant_stop = true, }, +#endif /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", -- cgit v1.2.3 From 18415f33e2ac4ab382cbca8b5ff82a9036b5bd49 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 May 2023 23:07:50 +0200 Subject: cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE There is often significant latency in the early stages of CPU bringup, and time is wasted by waking each CPU (e.g. with SIPI/INIT/INIT on x86) and then waiting for it to respond before moving on to the next. Allow a platform to enable parallel setup which brings all to be onlined CPUs up to the CPUHP_BP_KICK_AP state. While this state advancement on the control CPU (BP) is single-threaded the important part is the last state CPUHP_BP_KICK_AP which wakes the to be onlined CPUs up. 
This allows the CPUs to run up to the first sychronization point cpuhp_ap_sync_alive() where they wait for the control CPU to release them one by one for the full onlining procedure. This parallelism depends on the CPU hotplug core sync mechanism which ensures that the parallel brought up CPUs wait for release before touching any state which would make the CPU visible to anything outside the hotplug control mechanism. To handle the SMT constraints of X86 correctly the bringup happens in two iterations when CONFIG_HOTPLUG_SMT is enabled. The control CPU brings up the primary SMT threads of each core first, which can load the microcode without the need to rendevouz with the thread siblings. Once that's completed it brings up the secondary SMT threads. Co-developed-by: David Woodhouse Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley Tested-by: Oleksandr Natalenko Tested-by: Helge Deller # parisc Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://lore.kernel.org/r/20230512205257.240231377@linutronix.de --- kernel/cpu.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 98 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d2487aa4e7c8..005f863a3d2b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -649,8 +649,23 @@ bool cpu_smt_possible(void) cpu_smt_control != CPU_SMT_NOT_SUPPORTED; } EXPORT_SYMBOL_GPL(cpu_smt_possible); + +static inline bool cpuhp_smt_aware(void) +{ + return topology_smt_supported(); +} + +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_primary_thread_mask; +} #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +static inline bool cpuhp_smt_aware(void) { return false; } +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_present_mask; +} #endif static inline enum cpuhp_state @@ -1747,18 +1762,96 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu) return 0; } -void __init bringup_nonboot_cpus(unsigned int setup_max_cpus) +static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus, + enum cpuhp_state target) { unsigned int cpu; - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) + for_each_cpu(cpu, mask) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!--ncpus) break; - if (!cpu_online(cpu)) - cpu_up(cpu, CPUHP_ONLINE); + + if (cpu_up(cpu, target) && can_rollback_cpu(st)) { + /* + * If this failed then cpu_up() might have only + * rolled back to CPUHP_BP_KICK_AP for the final + * online. Clean it up. NOOP if already rolled back. + */ + WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); + } } } +#ifdef CONFIG_HOTPLUG_PARALLEL +static bool __cpuhp_parallel_bringup __ro_after_init = true; + +static int __init parallel_bringup_parse_param(char *arg) +{ + return kstrtobool(arg, &__cpuhp_parallel_bringup); +} +early_param("cpuhp.parallel", parallel_bringup_parse_param); + +/* + * On architectures which have enabled parallel bringup this invokes all BP + * prepare states for each of the to be onlined APs first. The last state + * sends the startup IPI to the APs. The APs proceed through the low level + * bringup code in parallel and then wait for the control CPU to release + * them one by one for the final onlining procedure. + * + * This avoids waiting for each AP to respond to the startup IPI in + * CPUHP_BRINGUP_CPU. 
+ */ +static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus) +{ + const struct cpumask *mask = cpu_present_mask; + + if (__cpuhp_parallel_bringup) + __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup(); + if (!__cpuhp_parallel_bringup) + return false; + + if (cpuhp_smt_aware()) { + const struct cpumask *pmask = cpuhp_get_primary_thread_mask(); + static struct cpumask tmp_mask __initdata; + + /* + * X86 requires to prevent that SMT siblings stopped while + * the primary thread does a microcode update for various + * reasons. Bring the primary threads up first. + */ + cpumask_and(&tmp_mask, mask, pmask); + cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP); + cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE); + /* Account for the online CPUs */ + ncpus -= num_online_cpus(); + if (!ncpus) + return true; + /* Create the mask for secondary CPUs */ + cpumask_andnot(&tmp_mask, mask, pmask); + mask = &tmp_mask; + } + + /* Bring the not-yet started CPUs up */ + cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP); + cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE); + return true; +} +#else +static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; } +#endif /* CONFIG_HOTPLUG_PARALLEL */ + +void __init bringup_nonboot_cpus(unsigned int setup_max_cpus) +{ + /* Try parallel bringup optimization if enabled */ + if (cpuhp_bringup_cpus_parallel(setup_max_cpus)) + return; + + /* Full per CPU serialized bringup */ + cpuhp_bringup_mask(cpu_present_mask, setup_max_cpus, CPUHP_ONLINE); +} + #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -- cgit v1.2.3 From 4d585f48ee6b38c54c075b151c5efd2ff65f8ffd Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Wed, 10 May 2023 14:30:47 -0700 Subject: bpf: Remove anonymous union in bpf_kfunc_call_arg_meta For kfuncs like bpf_obj_drop and bpf_refcount_acquire - which take user-defined types as input - the verifier needs to track the specific type passed in when checking a particular kfunc call. This requires tracking (btf, btf_id) tuple. In commit 7c50b1cb76ac ("bpf: Add bpf_refcount_acquire kfunc") I added an anonymous union with inner structs named after the specific kfuncs tracking this information, with the goal of making it more obvious which kfunc this data was being tracked / expected to be tracked on behalf of. In a recent series adding a new user of this tuple, Alexei mentioned that he didn't like this union usage as it doesn't really help with readability or bug-proofing ([0]). In an offline convo we agreed to have the tuple be fields (arg_btf, arg_btf_id), with comments in bpf_kfunc_call_arg_meta definition enumerating the uses of the fields by kfunc-specific handling logic. Such a pattern is used by struct bpf_reg_state without trouble. Accordingly, this patch removes the anonymous union in favor of arg_btf and arg_btf_id fields and comment enumerating their current uses. The patch also removes struct btf_and_id, which was only being used by the removed union's inner structs. This is a mechanical change, existing linked_list and rbtree tests will validate that correct (btf, btf_id) are being passed. 
[0]: https://lore.kernel.org/bpf/20230505021707.vlyiwy57vwxglbka@dhcp-172-26-102-232.dhcp.thefacebook.com Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230510213047.1633612-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 754129d41225..5c636276d907 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -279,11 +279,6 @@ struct bpf_call_arg_meta { struct btf_field *kptr_field; }; -struct btf_and_id { - struct btf *btf; - u32 btf_id; -}; - struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -302,10 +297,18 @@ struct bpf_kfunc_call_arg_meta { u64 value; bool found; } arg_constant; - union { - struct btf_and_id arg_obj_drop; - struct btf_and_id arg_refcount_acquire; - }; + + /* arg_btf and arg_btf_id are used by kfunc-specific handling, + * generally to pass info about user-defined local kptr types to later + * verification logic + * bpf_obj_drop + * Record the local kptr type to be drop'd + * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) + * Record the local kptr type to be refcount_incr'd + */ + struct btf *arg_btf; + u32 arg_btf_id; + struct { struct btf_field *field; } arg_list_head; @@ -10680,8 +10683,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (meta->btf == btf_vmlinux && meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - meta->arg_obj_drop.btf = reg->btf; - meta->arg_obj_drop.btf_id = reg->btf_id; + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; } break; case KF_ARG_PTR_TO_DYNPTR: @@ -10892,8 +10895,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); return -EINVAL; } - meta->arg_refcount_acquire.btf = reg->btf; - meta->arg_refcount_acquire.btf_id = reg->btf_id; + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; break; } } @@ -11125,12 +11128,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf; - regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id; + regs[BPF_REG_0].btf = meta.arg_btf; + regs[BPF_REG_0].btf_id = meta.arg_btf_id; insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_refcount_acquire.btf, - meta.arg_refcount_acquire.btf_id); + btf_find_struct_meta(meta.arg_btf, + meta.arg_btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -11260,8 +11263,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_obj_drop.btf, - meta.arg_obj_drop.btf_id); + btf_find_struct_meta(meta.arg_btf, + meta.arg_btf_id); } } } -- cgit v1.2.3 From d84b1a6708eec06b6cd9d33c5e0177bbd6ba4813 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 May 2023 11:07:10 -0700 Subject: bpf: fix 
calculation of subseq_idx during precision backtracking Subsequent instruction index (subseq_idx) is an index of an instruction that was verified/executed by verifier after the currently processed instruction. It is maintained during precision backtracking processing and is used to detect various subprog calling conditions. This patch fixes the bug with incorrectly resetting subseq_idx to -1 when going from child state to parent state during backtracking. If we don't maintain correct subseq_idx we can misidentify subprog calls leading to precision tracking bugs. One such case was triggered by test_global_funcs/global_func9 test where global subprog call happened to be the very last instruction in parent state, leading to subseq_idx==-1, triggering WARN_ONCE: [ 36.045754] verifier backtracking bug [ 36.045764] WARNING: CPU: 13 PID: 2073 at kernel/bpf/verifier.c:3503 __mark_chain_precision+0xcc6/0xde0 [ 36.046819] Modules linked in: aesni_intel(E) crypto_simd(E) cryptd(E) kvm_intel(E) kvm(E) irqbypass(E) i2c_piix4(E) serio_raw(E) i2c_core(E) crc32c_intel) [ 36.048040] CPU: 13 PID: 2073 Comm: test_progs Tainted: G W OE 6.3.0-07976-g4d585f48ee6b-dirty #972 [ 36.048783] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 36.049648] RIP: 0010:__mark_chain_precision+0xcc6/0xde0 [ 36.050038] Code: 3d 82 c6 05 bb 35 32 02 01 e8 66 21 ec ff 0f 0b b8 f2 ff ff ff e9 30 f5 ff ff 48 c7 c7 f3 61 3d 82 4c 89 0c 24 e8 4a 21 ec ff <0f> 0b 4c0 With the fix precision tracking across multiple states works correctly now: mark_precise: frame0: last_idx 45 first_idx 38 subseq_idx -1 mark_precise: frame0: regs=r8 stack= before 44: (61) r7 = *(u32 *)(r10 -4) mark_precise: frame0: regs=r8 stack= before 43: (85) call pc+41 mark_precise: frame0: regs=r8 stack= before 42: (07) r1 += -48 mark_precise: frame0: regs=r8 stack= before 41: (bf) r1 = r10 mark_precise: frame0: regs=r8 stack= before 40: (63) *(u32 *)(r10 -48) = r1 mark_precise: frame0: regs=r8 stack= before 39: (b4) w1 = 0 mark_precise: frame0: regs=r8 stack= before 38: (85) call pc+38 mark_precise: frame0: parent state regs=r8 stack=: R0_w=scalar() R1_w=map_value(off=4,ks=4,vs=8,imm=0) R6=1 R7_w=scalar() R8_r=P0 R10=fpm mark_precise: frame0: last_idx 36 first_idx 28 subseq_idx 38 mark_precise: frame0: regs=r8 stack= before 36: (18) r1 = 0xffff888104f2ed14 mark_precise: frame0: regs=r8 stack= before 35: (85) call pc+33 mark_precise: frame0: regs=r8 stack= before 33: (18) r1 = 0xffff888104f2ed10 mark_precise: frame0: regs=r8 stack= before 32: (85) call pc+36 mark_precise: frame0: regs=r8 stack= before 31: (07) r1 += -4 mark_precise: frame0: regs=r8 stack= before 30: (bf) r1 = r10 mark_precise: frame0: regs=r8 stack= before 29: (63) *(u32 *)(r10 -4) = r7 mark_precise: frame0: regs=r8 stack= before 28: (4c) w7 |= w0 mark_precise: frame0: parent state regs=r8 stack=: R0_rw=scalar() R6=1 R7_rw=scalar() R8_rw=P0 R10=fp0 fp-48_r=mmmmmmmm mark_precise: frame0: last_idx 27 first_idx 16 subseq_idx 28 mark_precise: frame0: regs=r8 stack= before 27: (85) call pc+31 mark_precise: frame0: regs=r8 stack= before 26: (b7) r1 = 0 mark_precise: frame0: regs=r8 stack= before 25: (b7) r8 = 0 Note how subseq_idx starts out as -1, then is preserved as 38 and then 28 as we go up the parent state chain. 
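The shape of the fix, condensed from the diff that follows (most of the loop body elided):

  int subseq_idx = -1;

  for (i = last_idx;;) {
          err = skip_first ? 0 : backtrack_insn(env, i, subseq_idx, bt);
          ...
          subseq_idx = i;
          i = get_prev_insn_idx(st, i, &history);
  }
  ...
  /* when walking up to the parent state: seed, don't reset to -1 */
  subseq_idx = first_idx;
  last_idx = st->last_insn_idx;
  first_idx = st->first_insn_idx;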
Reported-by: Alexei Starovoitov Fixes: fde2a3882bd0 ("bpf: support precision propagation in the presence of subprogs") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230515180710.1535018-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c636276d907..f597491259ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3864,10 +3864,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; + int subseq_idx = -1; struct bpf_func_state *func; struct bpf_reg_state *reg; bool skip_first = true; - int i, prev_i, fr, err; + int i, fr, err; if (!env->bpf_capable) return 0; @@ -3897,8 +3898,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d\n", - bt->frame, last_idx, first_idx); + verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", + bt->frame, last_idx, first_idx, subseq_idx); } if (last_idx < 0) { @@ -3930,12 +3931,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - for (i = last_idx, prev_i = -1;;) { + for (i = last_idx;;) { if (skip_first) { err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, prev_i, bt); + err = backtrack_insn(env, i, subseq_idx, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, env->cur_state); @@ -3952,7 +3953,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; if (i == first_idx) break; - prev_i = i; + subseq_idx = i; i = get_prev_insn_idx(st, i, &history); if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 @@ -4031,6 +4032,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) if (bt_empty(bt)) return 0; + subseq_idx = first_idx; last_idx = st->last_insn_idx; first_idx = st->first_insn_idx; } -- cgit v1.2.3 From 47e79cbeea4b3891ad476047f4c68543eb51c8e0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 15 May 2023 13:08:48 +0000 Subject: bpf: Remove bpf trampoline selector After commit e21aa341785c ("bpf: Fix fexit trampoline."), the selector is only used to indicate how many times the bpf trampoline image are updated and been displayed in the trampoline ksym name. After the trampoline is freed, the selector will start from 0 again. So the selector is a useless value to the user. We can remove it. If the user want to check whether the bpf trampoline image has been updated or not, the user can compare the address. Each time the trampoline image is updated, the address will change consequently. Jiri also pointed out another issue that perf is still using the old name "bpf_trampoline_%lu", so this change can fix the issue in perf. 
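The user-visible effect is the ksym name (from the diff that follows): the selector suffix disappears, so the symbol takes the bpf_trampoline_<key> form perf already looks for, and only the image address changes across updates:

  - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
  + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);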
Fixes: e21aa341785c ("bpf: Fix fexit trampoline.") Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Song Liu Cc: Jiri Olsa Link: https://lore.kernel.org/bpf/ZFvOOlrmHiY9AgXE@krava Link: https://lore.kernel.org/bpf/20230515130849.57502-3-laoar.shao@gmail.com --- kernel/bpf/trampoline.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ac021bc43a66..84850e66ce3d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -344,7 +344,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } -static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; @@ -371,7 +371,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); bpf_image_ksym_add(image, ksym); return im; @@ -401,11 +401,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut err = unregister_fentry(tr, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; - tr->selector = 0; goto out; } - im = bpf_tramp_image_alloc(tr->key, tr->selector); + im = bpf_tramp_image_alloc(tr->key); if (IS_ERR(im)) { err = PTR_ERR(im); goto out; } @@ -442,8 +441,7 @@ again: set_memory_rox((long)im->image, 1); - WARN_ON(tr->cur_image && tr->selector == 0); - WARN_ON(!tr->cur_image && tr->selector); + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); @@ -473,7 +471,6 @@ again: if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; - tr->selector++; out: /* If any error happens, restore previous flags */ if (err) -- cgit v1.2.3 From 108598c39eefbedc9882273ac0df96127a629220 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 15 May 2023 13:08:47 +0000 Subject: bpf: Fix memleak due to fentry attach failure If it fails to attach fentry, the allocated bpf trampoline image will be left in the system. That can be verified by checking /proc/kallsyms. This memleak can be verified by a simple bpf program as follows: SEC("fentry/trap_init") int fentry_run() { return 0; } It will fail to attach trap_init because this function is freed after kernel init, and then we can find the trampoline image is left in the system by checking /proc/kallsyms. $ tail /proc/kallsyms ffffffffc0613000 t bpf_trampoline_6442453466_1 [bpf] ffffffffc06c3000 t bpf_trampoline_6442453466_1 [bpf] $ bpftool btf dump file /sys/kernel/btf/vmlinux | grep "FUNC 'trap_init'" [2522] FUNC 'trap_init' type_id=119 linkage=static $ echo $((6442453466 & 0x7fffffff)) 2522 Note that there are two leftover bpf trampoline images; that is because libbpf will fall back to a raw tracepoint if -EINVAL is returned.
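Condensed, the fix in the diff that follows routes every post-allocation failure through a common free label (elisions mine):

  im = bpf_tramp_image_alloc(tr->key);
  ...
  err = arch_prepare_bpf_trampoline(...);
  if (err < 0)
          goto out_free;               /* previously: goto out, leaking im */
  ...
  out_free:
          bpf_tramp_image_free(im);    /* ksym del, exec pages, modmem uncharge */
          goto out;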
Fixes: e21aa341785c ("bpf: Fix fexit trampoline.") Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Song Liu Cc: Jiri Olsa Link: https://lore.kernel.org/bpf/20230515130849.57502-2-laoar.shao@gmail.com --- kernel/bpf/trampoline.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 84850e66ce3d..78acf28d4873 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -251,11 +251,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a return tlinks; } -static void __bpf_tramp_image_put_deferred(struct work_struct *work) +static void bpf_tramp_image_free(struct bpf_tramp_image *im) { - struct bpf_tramp_image *im; - - im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); bpf_jit_uncharge_modmem(PAGE_SIZE); @@ -263,6 +260,14 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) kfree_rcu(im, rcu); } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ + struct bpf_tramp_image *im; + + im = container_of(work, struct bpf_tramp_image, work); + bpf_tramp_image_free(im); +} + /* callback, fexit step 3 or fentry step 2 */ static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) { @@ -437,7 +442,7 @@ again: &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) - goto out; + goto out_free; set_memory_rox((long)im->image, 1); @@ -466,7 +471,7 @@ again: } #endif if (err) - goto out; + goto out_free; if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); @@ -477,6 +482,10 @@ out: tr->flags = orig_flags; kfree(tlinks); return err; + +out_free: + bpf_tramp_image_free(im); + goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) -- cgit v1.2.3 From cff36398bd4c7d322d424433db437f3c3391c491 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 16 May 2023 11:04:09 -0700 Subject: bpf: drop unnecessary user-triggerable WARN_ONCE in verifier log It's trivial for a user to trigger the "verifier log line truncated" warning, as the verifier has a fixed-size buffer of 1024 bytes (as of now), and there are at least two pieces of user-provided information that can be output through this buffer, and both can be arbitrarily sized by the user: - BTF names; - BTF.ext source code line strings. The verifier log buffer should be properly sized for typical verifier state output. But it's sort-of expected that this buffer won't be long enough in some circumstances. So let's drop the check. In any case, the code will work correctly, at worst truncating part of a single line of output.
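The safety the message relies on is plain scnprintf()/vscnprintf() behavior. A userspace sketch using snprintf(), which differs only in returning the would-be rather than the written length, shows the clamping:

  #include <stdio.h>

  int main(void)
  {
          char kbuf[16];
          int n = snprintf(kbuf, sizeof(kbuf), "%s", "a very long verifier line");

          /* kbuf holds a NUL-terminated truncated prefix; nothing overruns,
           * so the WARN added no safety, only a user-triggerable splat. */
          printf("stored \"%s\", would-be length %d\n", kbuf, n);
          return 0;
  }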
Reported-by: syzbot+8b2a08dfbd25fd933d75@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230516180409.3549088-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 046ddff37a76..850494423530 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -62,9 +62,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; -- cgit v1.2.3 From e455ca40dbcf2cd50d1e59bf4b2752b300bcdad4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:10:52 +0200 Subject: audit: avoid missing-prototype warnings Building with 'make W=1' reveals two function definitions without a previous prototype in the audit code: lib/compat_audit.c:32:5: error: no previous prototype for 'audit_classify_compat_syscall' [-Werror=missing-prototypes] kernel/audit.c:1813:14: error: no previous prototype for 'audit_serial' [-Werror=missing-prototypes] The first one needs a declaration from linux/audit.h but cannot include that header without causing conflicting (compat) syscall number definitions, so move it into linux/audit_arch.h. The second one is declared conditionally based on CONFIG_AUDITSYSCALL but is needed as a local function even when that option is disabled, so move the declaration out of the #ifdef block. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Moore --- kernel/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index c57b008b9914..94738bce40b2 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -259,8 +259,8 @@ extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); /* audit watch/mark/tree functions */ -#ifdef CONFIG_AUDITSYSCALL extern unsigned int audit_serial(void); +#ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial); -- cgit v1.2.3 From 725e8ec59c56c65fb92e343c10a8842cd0d4f194 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Add pwq->stats[] and a monitoring script Currently, the only way to peer into workqueue operations is through tracing. While possible, it isn't easy or convenient to monitor per-workqueue behaviors over time this way. Let's add pwq->stats[] that track relevant events and a drgn monitoring script - tools/workqueue/wq_monitor.py. It's arguable whether this needs to be configurable. However, it currently only has several counters and the runtime overhead shouldn't be noticeable given that they're on pwq's which are per-cpu on per-cpu workqueues and per-numa-node on unbound ones. Let's keep it simple for the time being. v2: Patch reordered to earlier with fewer fields. Fields will be added back gradually. Help message improved.
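The five counters and where they tick, condensed from the diff that follows:

  pwq->stats[PWQ_STAT_STARTED]++;     /* just before worker->current_func(work) */
  pwq->stats[PWQ_STAT_COMPLETED]++;   /* just after it returns */
  pwq->stats[PWQ_STAT_CM_WAKEUP]++;   /* wq_worker_sleeping(), before wake_up_worker() */
  pwq->stats[PWQ_STAT_MAYDAY]++;      /* send_mayday() */
  pwq->stats[PWQ_STAT_RESCUED]++;     /* rescuer picking up a linked work item */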
Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 36bccc1285b3..60d5b84cccb2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -199,6 +199,20 @@ struct worker_pool { struct rcu_head rcu; }; +/* + * Per-pool_workqueue statistics. These can be monitored using + * tools/workqueue/wq_monitor.py. + */ +enum pool_workqueue_stats { + PWQ_STAT_STARTED, /* work items started execution */ + PWQ_STAT_COMPLETED, /* work items completed execution */ + PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_MAYDAY, /* maydays to rescuer */ + PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ + + PWQ_NR_STATS, +}; + /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS * of work_struct->data are used for flags and the remaining high bits @@ -236,6 +250,8 @@ struct pool_workqueue { struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ + u64 stats[PWQ_NR_STATS]; + /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue @@ -929,8 +945,10 @@ void wq_worker_sleeping(struct task_struct *task) } pool->nr_running--; - if (need_more_worker(pool)) + if (need_more_worker(pool)) { + worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; wake_up_worker(pool); + } raw_spin_unlock_irq(&pool->lock); } @@ -2165,6 +2183,7 @@ static void send_mayday(struct work_struct *work) get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); + pwq->stats[PWQ_STAT_MAYDAY]++; } } @@ -2403,6 +2422,7 @@ __acquires(&pool->lock) * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); + pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2410,6 +2430,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); + pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); @@ -2653,6 +2674,7 @@ repeat: if (first) pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + pwq->stats[PWQ_STAT_RESCUED]++; } first = false; } -- cgit v1.2.3 From 3a46c9833c1fad3b4a91bbbeb856810c7e1d8e47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Re-order struct worker fields struct worker was laid out with the intent that all fields that are modified for each work item execution are in the first cacheline. However, this hasn't been true for a while with the addition of ->last_func. Let's just collect hot fields together at the top. Move ->sleeping in the hole after ->current_color and move ->lst_func right below. While at it, drop the cacheline comment which isn't useful anymore. 
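A hypothetical compile-time check of that claim (not part of the patch; struct worker is private to workqueue_internal.h, so it would have to live next to it):

  #include <linux/cache.h>        /* L1_CACHE_BYTES */
  #include <linux/stddef.h>       /* offsetofend() */
  #include <linux/build_bug.h>    /* static_assert() */
  #include "workqueue_internal.h"

  /* With 64-byte cache lines, the per-execution hot fields regrouped at
   * the top of struct worker, through last_func, should all fit in the
   * first line. */
  static_assert(offsetofend(struct worker, last_func) <= L1_CACHE_BYTES,
                "hot worker fields spill past the first cache line");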
Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue_internal.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index e00b1204a8e9..0600f04ceeb2 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,9 +32,12 @@ struct worker { work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ unsigned int current_color; /* L: current_work's color */ - struct list_head scheduled; /* L: scheduled works */ + int sleeping; /* None */ + + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; /* L: last work's fn */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ + struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* A: the associated pool */ @@ -45,7 +48,6 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -55,9 +57,6 @@ struct worker { /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ - - /* used by the scheduler to determine a worker's last known identity */ - work_func_t last_func; }; /** -- cgit v1.2.3 From c54d5046a06b90adb3d1188f0741a88692854354 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Move worker_set/clr_flags() upwards They are going to be used in wq_worker_stopping(). Move them upwards. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 108 ++++++++++++++++++++++++++--------------------------- 1 file changed, 54 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 60d5b84cccb2..d70bb5be99ce 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -879,6 +879,60 @@ static void wake_up_worker(struct worker_pool *pool) wake_up_process(worker->task); } +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * + * Set @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags) +{ + struct worker_pool *pool = worker->pool; + + WARN_ON_ONCE(worker->task != current); + + /* If transitioning into NOT_RUNNING, adjust nr_running. */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + pool->nr_running--; + } + + worker->flags |= flags; +} + +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct worker_pool *pool = worker->pool; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. 
+ */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + pool->nr_running++; +} + /** * wq_worker_running - a worker is running again * @task: task waking up @@ -983,60 +1037,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) return worker->last_func; } -/** - * worker_set_flags - set worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to set - * - * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) - */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags) -{ - struct worker_pool *pool = worker->pool; - - WARN_ON_ONCE(worker->task != current); - - /* If transitioning into NOT_RUNNING, adjust nr_running. */ - if ((flags & WORKER_NOT_RUNNING) && - !(worker->flags & WORKER_NOT_RUNNING)) { - pool->nr_running--; - } - - worker->flags |= flags; -} - -/** - * worker_clr_flags - clear worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to clear - * - * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) - */ -static inline void worker_clr_flags(struct worker *worker, unsigned int flags) -{ - struct worker_pool *pool = worker->pool; - unsigned int oflags = worker->flags; - - WARN_ON_ONCE(worker->task != current); - - worker->flags &= ~flags; - - /* - * If transitioning out of NOT_RUNNING, increment nr_running. Note - * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask - * of multiple flags, not a single flag. - */ - if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) - if (!(worker->flags & WORKER_NOT_RUNNING)) - pool->nr_running++; -} - /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest -- cgit v1.2.3 From bdf8b9bfc131864f0fcef268b34123acfb6a1b59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Improve locking rule description for worker fields * Some worker fields are modified only by the worker itself while holding pool->lock thus making them safe to read from self, IRQ context if the CPU is running the worker or while holding pool->lock. Add 'K' locking rule for them. * worker->sleeping is currently marked "None" which isn't very descriptive. It's used only by the worker itself. Add 'S' locking rule for it. A future patch will depend on the 'K' rule to access worker->current_* from the scheduler ticks. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++++++ kernel/workqueue_internal.h | 15 ++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d70bb5be99ce..942421443603 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,12 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * + * K: Only modified by worker while holding pool->lock. Can be safely read by + * self, while holding pool->lock or from IRQ context if %current is the + * kworker. + * + * S: Only modified by worker self. + * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. 
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 0600f04ceeb2..c2455be7b4c2 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -28,14 +28,15 @@ struct worker { struct hlist_node hentry; /* L: while busy */ }; - struct work_struct *current_work; /* L: work being processed */ - work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ - unsigned int current_color; /* L: current_work's color */ - int sleeping; /* None */ + struct work_struct *current_work; /* K: work being processed and its */ + work_func_t current_func; /* K: function */ + struct pool_workqueue *current_pwq; /* K: pwq */ + unsigned int current_color; /* K: color */ + + int sleeping; /* S: is worker sleeping? */ /* used by the scheduler to determine a worker's last known identity */ - work_func_t last_func; /* K: last work's fn */ + work_func_t last_func; /* K: last work's fn */ struct list_head scheduled; /* L: scheduled works */ @@ -45,7 +46,7 @@ struct worker { struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ - unsigned long last_active; /* L: last active timestamp */ + unsigned long last_active; /* K: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ -- cgit v1.2.3 From 616db8779b1e3f93075df691432cccc5ef3c3ba0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE If a per-cpu work item hogs the CPU, it can prevent other work items from starting through concurrency management. A per-cpu workqueue which intends to host such CPU-hogging work items can choose to not participate in concurrency management by setting %WQ_CPU_INTENSIVE; however, this can be error-prone and difficult to debug when missed. This patch adds an automatic CPU usage based detection. If a concurrency-managed work item consumes more CPU time than the threshold (10ms by default) continuously without intervening sleeps, wq_worker_tick() which is called from scheduler_tick() will detect the condition and automatically mark it CPU_INTENSIVE. The mechanism isn't foolproof: * Detection depends on the tick hitting the work item. Getting preempted at the right timings may allow a violating work item to evade detection at least temporarily. * nohz_full CPUs may not be running ticks and thus can fail detection. * Even when detection is working, the 10ms detection delays can add up if many CPU-hogging work items are queued at the same time. However, in the vast majority of cases, this should be able to detect violations reliably and provide reasonable protection with a small increase in code complexity. If some work items trigger this condition repeatedly, the bigger problem likely is the CPU being saturated with such per-cpu work items and the solution would be making them UNBOUND. The next patch will add a debug mechanism to help spot such cases. v4: Documentation for workqueue.cpu_intensive_thresh_us added to kernel-parameters.txt. v3: Switch to use wq_worker_tick() instead of hooking into preemptions as suggested by Peter. v2: Lai pointed out that wq_worker_stopping() also needs to be called from preemption and rtlock paths and an earlier patch was updated accordingly. This patch adds a comment describing the risk of infinite recursions and how they're avoided. 
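A usage note inferred from the module_param_named(..., 0644) in the diff below, not spelled out in the changelog: the threshold should be settable both at boot via workqueue.cpu_intensive_thresh_us= and at runtime through /sys/module/workqueue/parameters/cpu_intensive_thresh_us, so systems with known long-running but legitimate per-cpu work items can raise it instead of reworking their workqueues.
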
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Lai Jiangshan --- kernel/sched/core.c | 3 ++ kernel/workqueue.c | 68 +++++++++++++++++++++++++++++++++++++++++---- kernel/workqueue_internal.h | 2 ++ 3 files changed, 68 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 944c3ae39861..3484cada9a4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5632,6 +5632,9 @@ void scheduler_tick(void) perf_event_task_tick(); + if (curr->flags & PF_WQ_WORKER) + wq_worker_tick(curr); + #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 942421443603..3dc83d5eba50 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -212,6 +212,7 @@ struct worker_pool { enum pool_workqueue_stats { PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_COMPLETED, /* work items completed execution */ + PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -332,6 +333,14 @@ static struct kmem_cache *pwq_cache; static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +/* + * Per-cpu work items which run for longer than the following threshold are + * automatically considered CPU intensive and excluded from concurrency + * management to prevent them from noticeably delaying other per-cpu work items. + */ +static unsigned long wq_cpu_intensive_thresh_us = 10000; +module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); + static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); @@ -962,6 +971,13 @@ void wq_worker_running(struct task_struct *task) if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); + + /* + * CPU intensive auto-detection cares about how long a work item hogged + * CPU without sleeping. Reset the starting timestamp on wakeup. + */ + worker->current_at = worker->task->se.sum_exec_runtime; + worker->sleeping = 0; } @@ -1012,6 +1028,45 @@ void wq_worker_sleeping(struct task_struct *task) raw_spin_unlock_irq(&pool->lock); } +/** + * wq_worker_tick - a scheduler tick occurred while a kworker is running + * @task: task currently running + * + * Called from scheduler_tick(). We're in the IRQ context and the current + * worker's fields which follow the 'K' locking rule can be accessed safely. + */ +void wq_worker_tick(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + struct pool_workqueue *pwq = worker->current_pwq; + struct worker_pool *pool = worker->pool; + + if (!pwq) + return; + + /* + * If the current worker is concurrency managed and hogged the CPU for + * longer than wq_cpu_intensive_thresh_us, it's automatically marked + * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. 
+ */ + if ((worker->flags & WORKER_NOT_RUNNING) || + worker->task->se.sum_exec_runtime - worker->current_at < + wq_cpu_intensive_thresh_us * NSEC_PER_USEC) + return; + + raw_spin_lock(&pool->lock); + + worker_set_flags(worker, WORKER_CPU_INTENSIVE); + pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; + + if (need_more_worker(pool)) { + pwq->stats[PWQ_STAT_CM_WAKEUP]++; + wake_up_worker(pool); + } + + raw_spin_unlock(&pool->lock); +} + /** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. @@ -2327,7 +2382,6 @@ __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; - bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP @@ -2364,6 +2418,7 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; + worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -2381,7 +2436,7 @@ __acquires(&pool->lock) * of concurrency management and the next code block will chain * execution of the pending work items. */ - if (unlikely(cpu_intensive)) + if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* @@ -2461,9 +2516,12 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); - /* clear cpu intensive status */ - if (unlikely(cpu_intensive)) - worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* + * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked + * CPU intensive by wq_worker_tick() if @work hogged CPU longer than + * wq_cpu_intensive_thresh_us. Clear it. + */ + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index c2455be7b4c2..6b1d66e28269 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -31,6 +31,7 @@ struct worker { struct work_struct *current_work; /* K: work being processed and its */ work_func_t current_func; /* K: function */ struct pool_workqueue *current_pwq; /* K: pwq */ + u64 current_at; /* K: runtime at start or last wakeup */ unsigned int current_color; /* K: color */ int sleeping; /* S: is worker sleeping? */ @@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void) */ void wq_worker_running(struct task_struct *task); void wq_worker_sleeping(struct task_struct *task); +void wq_worker_tick(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 6363845005202148b8409ec3082e80845c19d309 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:08 -1000 Subject: workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism Workqueue now automatically marks per-cpu work items that hog CPU for too long as CPU_INTENSIVE, which excludes them from concurrency management and prevents stalling other concurrency-managed work items. If a work function keeps running over the threshold, it likely needs to be switched to use an unbound workqueue. This patch adds a debug mechanism which tracks the work functions which trigger the automatic CPU_INTENSIVE mechanism and reports them using pr_warn() with exponential backoff. v3: Documentation update. 
v2: Drop bouncing to kthread_worker for printing messages. It was to avoid introducing circular locking dependency through printk but not effective as it still had pool lock -> wci_lock -> printk -> pool lock loop. Let's just print directly using printk_deferred(). Signed-off-by: Tejun Heo Suggested-by: Peter Zijlstra --- kernel/workqueue.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3dc83d5eba50..4ca66384d288 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -948,6 +948,98 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT + +/* + * Concurrency-managed per-cpu work items that hog CPU for longer than + * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, + * which prevents them from stalling other concurrency-managed work items. If a + * work function keeps triggering this mechanism, it's likely that the work item + * should be using an unbound workqueue instead. + * + * wq_cpu_intensive_report() tracks work functions which trigger such conditions + * and reports them so that they can be examined and converted to use unbound + * workqueues as appropriate. To avoid flooding the console, each violating work + * function is tracked and reported with exponential backoff. + */ +#define WCI_MAX_ENTS 128 + +struct wci_ent { + work_func_t func; + atomic64_t cnt; + struct hlist_node hash_node; +}; + +static struct wci_ent wci_ents[WCI_MAX_ENTS]; +static int wci_nr_ents; +static DEFINE_RAW_SPINLOCK(wci_lock); +static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); + +static struct wci_ent *wci_find_ent(work_func_t func) +{ + struct wci_ent *ent; + + hash_for_each_possible_rcu(wci_hash, ent, hash_node, + (unsigned long)func) { + if (ent->func == func) + return ent; + } + return NULL; +} + +static void wq_cpu_intensive_report(work_func_t func) +{ + struct wci_ent *ent; + +restart: + ent = wci_find_ent(func); + if (ent) { + u64 cnt; + + /* + * Start reporting from the fourth time and back off + * exponentially. + */ + cnt = atomic64_inc_return_relaxed(&ent->cnt); + if (cnt >= 4 && is_power_of_2(cnt)) + printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", + ent->func, wq_cpu_intensive_thresh_us, + atomic64_read(&ent->cnt)); + return; + } + + /* + * @func is a new violation. Allocate a new entry for it. If wci_ents[] + * is exhausted, something went really wrong and we probably made enough + * noise already. 
+ */ + if (wci_nr_ents >= WCI_MAX_ENTS) + return; + + raw_spin_lock(&wci_lock); + + if (wci_nr_ents >= WCI_MAX_ENTS) { + raw_spin_unlock(&wci_lock); + return; + } + + if (wci_find_ent(func)) { + raw_spin_unlock(&wci_lock); + goto restart; + } + + ent = &wci_ents[wci_nr_ents++]; + ent->func = func; + atomic64_set(&ent->cnt, 1); + hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); + + raw_spin_unlock(&wci_lock); +} + +#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ +static void wq_cpu_intensive_report(work_func_t func) {} +#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ + /** * wq_worker_running - a worker is running again * @task: task waking up @@ -1057,6 +1149,7 @@ void wq_worker_tick(struct task_struct *task) raw_spin_lock(&pool->lock); worker_set_flags(worker, WORKER_CPU_INTENSIVE); + wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; if (need_more_worker(pool)) { -- cgit v1.2.3 From 8a1dd1e547c1a037692e7a6da6a76108108c72b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 May 2023 17:02:09 -1000 Subject: workqueue: Track and monitor per-workqueue CPU time usage Now that wq_worker_tick() is there, we can easily track the rough CPU time consumption of each workqueue by charging the whole tick whenever a tick hits an active workqueue. While not super accurate, it provides reasonable visibility into the workqueues that consume a lot of CPU cycles. wq_monitor.py is updated to report the per-workqueue CPU times. v2: wq_monitor.py was using "cputime" as the key when outputting in json format. Use "cpu_time" instead for consistency with other fields. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4ca66384d288..ee16ddb0647c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -212,6 +212,7 @@ struct worker_pool { enum pool_workqueue_stats { PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_COMPLETED, /* work items completed execution */ + PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ @@ -1136,6 +1137,8 @@ void wq_worker_tick(struct task_struct *task) if (!pwq) return; + pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; + /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked -- cgit v1.2.3 From 8a3e82d38674066f4cbed3588b78b0d9b8b15ed7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:35:42 +0200 Subject: x86/hibernate: Declare global functions in suspend.h Three functions that are defined in x86 specific code to override generic __weak implementations cause a warning because of a missing prototype: arch/x86/power/cpu.c:298:5: error: no previous prototype for 'hibernate_resume_nonboot_cpu_disable' [-Werror=missing-prototypes] arch/x86/power/hibernate.c:129:5: error: no previous prototype for 'arch_hibernation_header_restore' [-Werror=missing-prototypes] arch/x86/power/hibernate.c:91:5: error: no previous prototype for 'arch_hibernation_header_save' [-Werror=missing-prototypes] Move the declarations into a global header so it can be included by any file defining one of these. 
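To illustrate why the shared declaration matters (a hedged sketch, not the actual kernel code): with -Wmissing-prototypes, the generic __weak default and the arch override both need to see the same extern declaration, so it belongs in a header that both sides include.

	/* include/linux/suspend.h: one shared prototype */
	int arch_hibernation_header_save(void *addr, unsigned int max_size);

	/* generic code: weak default, overridable per arch */
	int __weak arch_hibernation_header_save(void *addr, unsigned int max_size)
	{
		return -ENOSYS;	/* assumed default; the real stub may differ */
	}

	/* arch/x86/power/hibernate.c: strong definition overrides the weak one */
	int arch_hibernation_header_save(void *addr, unsigned int max_size)
	{
		/* ... save x86-specific resume information ... */
		return 0;
	}
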
Signed-off-by: Arnd Bergmann Signed-off-by: Dave Hansen Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/all/20230516193549.544673-14-arnd%40kernel.org --- kernel/power/power.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..a6a16faf0ead 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -26,9 +26,6 @@ extern void __init hibernate_image_size_init(void); /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) -extern int arch_hibernation_header_save(void *addr, unsigned int max_size); -extern int arch_hibernation_header_restore(void *addr); - static inline int init_header_complete(struct swsusp_info *info) { return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); @@ -41,8 +38,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int hibernate_resume_nonboot_cpu_disable(void); - /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] -- cgit v1.2.3 From eb1cfd09f788e39948a82be8063e54e40dd018d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 May 2023 15:58:46 -0400 Subject: lockdep: Add lock_set_cmp_fn() annotation This implements a new interface to lockdep, lock_set_cmp_fn(), for defining a custom ordering when taking multiple locks of the same class. This is an alternative to subclasses, but cannot fully replace them since subclasses allow lock hierarchies with other classes intertwined, while this relies on pure class nesting. Specifically, if A is our nesting class then: A/0 <- B <- A/1 Would be a valid lock order with subclasses (each subclass really is a full class from the validation PoV) but not with this annotation, which requires all nesting to be consecutive. 
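A hedged usage sketch may help here (the struct and its ordering rule are invented for illustration; the callback signatures follow the lockdep changes below, where a negative cmp_fn(held-lock, new-lock) return marks that acquisition order as valid):

	/* illustrative: nodes of one lock class, locked strictly leaf -> root */
	struct btree_node {
		struct mutex	lock;
		unsigned int	level;		/* 0 == leaf */
	};

	static int btree_node_lock_cmp_fn(const struct lockdep_map *_a,
					  const struct lockdep_map *_b)
	{
		const struct btree_node *a =
			container_of(_a, struct btree_node, lock.dep_map);
		const struct btree_node *b =
			container_of(_b, struct btree_node, lock.dep_map);

		/* negative: holding a while taking b is a valid order */
		return (int)a->level - (int)b->level;
	}

	/* at init time, e.g. through the lock_set_cmp_fn() wrapper: */
	lockdep_set_lock_cmp_fn(&node->lock.dep_map, btree_node_lock_cmp_fn, NULL);

The optional print function, when provided, lets the splat below include per-instance detail (the "l=0 0:2803368" annotations) instead of just the class name.
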
Example output: | ============================================ | WARNING: possible recursive locking detected | 6.2.0-rc8-00003-g7d81e591ca6a-dirty #15 Not tainted | -------------------------------------------- | kworker/14:3/938 is trying to acquire lock: | ffff8880143218c8 (&b->lock l=0 0:2803368){++++}-{3:3}, at: bch_btree_node_get.part.0+0x81/0x2b0 | | but task is already holding lock: | ffff8880143de8c8 (&b->lock l=1 1048575:9223372036854775807){++++}-{3:3}, at: __bch_btree_map_nodes+0xea/0x1e0 | and the lock comparison function returns 1: | | other info that might help us debug this: | Possible unsafe locking scenario: | | CPU0 | ---- | lock(&b->lock l=1 1048575:9223372036854775807); | lock(&b->lock l=0 0:2803368); | | *** DEADLOCK *** | | May be due to missing lock nesting notation | | 3 locks held by kworker/14:3/938: | #0: ffff888005ea9d38 ((wq_completion)bcache){+.+.}-{0:0}, at: process_one_work+0x1ec/0x530 | #1: ffff8880098c3e70 ((work_completion)(&cl->work)#3){+.+.}-{0:0}, at: process_one_work+0x1ec/0x530 | #2: ffff8880143de8c8 (&b->lock l=1 1048575:9223372036854775807){++++}-{3:3}, at: __bch_btree_map_nodes+0xea/0x1e0 [peterz: extended changelog] Signed-off-by: Kent Overstreet Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230509195847.1745548-1-kent.overstreet@linux.dev --- kernel/locking/lockdep.c | 118 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dcd1d5bfc1e0..3e8950f3beab 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -709,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct held_lock *hlock, struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; @@ -724,17 +724,19 @@ static void __print_lock_name(struct lock_class *class) printk(KERN_CONT "#%d", class->name_version); if (class->subclass) printk(KERN_CONT "/%d", class->subclass); + if (hlock && class->print_fn) + class->print_fn(hlock->instance); } } -static void print_lock_name(struct lock_class *class) +static void print_lock_name(struct held_lock *hlock, struct lock_class *class) { char usage[LOCK_USAGE_CHARS]; get_usage_chars(class, usage); printk(KERN_CONT " ("); - __print_lock_name(class); + __print_lock_name(hlock, class); printk(KERN_CONT "){%s}-{%d:%d}", usage, class->wait_type_outer ?: class->wait_type_inner, class->wait_type_inner); @@ -772,7 +774,7 @@ static void print_lock(struct held_lock *hlock) } printk(KERN_CONT "%px", hlock->instance); - print_lock_name(lock); + print_lock_name(hlock, lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -1868,7 +1870,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) if (debug_locks_silent) return; printk("\n-> #%u", depth); - print_lock_name(target->class); + print_lock_name(NULL, target->class); printk(KERN_CONT ":\n"); print_lock_trace(target->trace, 6); } @@ -1899,11 +1901,11 @@ print_circular_lock_scenario(struct held_lock *src, */ if (parent != source) { printk("Chain exists of:\n "); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT " --> "); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT " --> "); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT "\n\n"); } @@ -1914,13 +1916,13 @@ 
print_circular_lock_scenario(struct held_lock *src, printk(" rlock("); else printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); if (src_read != 0) printk(" rlock("); @@ -1928,7 +1930,7 @@ print_circular_lock_scenario(struct held_lock *src, printk(" sync("); else printk(" lock("); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2154,6 +2156,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry, return ret; } +static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *); + /* * Prove that the dependency graph starting at can not * lead to . If it can, there is a circle when adding @@ -2185,7 +2189,10 @@ check_noncircular(struct held_lock *src, struct held_lock *target, *trace = save_trace(); } - print_circular_bug(&src_entry, target_entry, src, target); + if (src->class_idx == target->class_idx) + print_deadlock_bug(current, src, target); + else + print_circular_bug(&src_entry, target_entry, src, target); } return ret; @@ -2341,7 +2348,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) int bit; printk("%*s->", depth, ""); - print_lock_name(class); + print_lock_name(NULL, class); #ifdef CONFIG_DEBUG_LOCKDEP printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); #endif @@ -2523,11 +2530,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, */ if (middle_class != unsafe_class) { printk("Chain exists of:\n "); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT " --> "); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT " --> "); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT "\n\n"); } @@ -2535,18 +2542,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); printk(" lock("); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2583,20 +2590,20 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nand this task is already holding:\n"); print_lock(prev); pr_warn("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); + print_lock_name(prev, hlock_class(prev)); pr_cont(" ->"); - print_lock_name(hlock_class(next)); + print_lock_name(next, hlock_class(next)); pr_cont("\n"); pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_entry->class); + print_lock_name(NULL, backwards_entry->class); pr_warn("\n... 
which became %s-irq-safe at:\n", irqclass); print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); + print_lock_name(NULL, forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); @@ -2966,10 +2973,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(prev); + __print_lock_name(prv, prev); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(next); + __print_lock_name(nxt, next); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); @@ -2979,6 +2986,8 @@ static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { + struct lock_class *class = hlock_class(prev); + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; @@ -2993,6 +3002,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); + if (class->cmp_fn) { + pr_warn("and the lock comparison function returns %i:\n", + class->cmp_fn(prev->instance, next->instance)); + } + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); @@ -3014,6 +3028,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, static int check_deadlock(struct task_struct *curr, struct held_lock *next) { + struct lock_class *class; struct held_lock *prev; struct held_lock *nest = NULL; int i; @@ -3034,6 +3049,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) if ((next->read == 2) && prev->read) continue; + class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; + /* * We're holding the nest_lock, which serializes this lock's * nesting behaviour. @@ -3095,6 +3116,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + return 2; + } + /* * Prove that the new -> dependency would not * create a circular dependency in the graph. 
(We do this by @@ -3571,7 +3600,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); + print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -3928,11 +3957,11 @@ static void print_usage_bug_scenario(struct held_lock *lock) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -4018,7 +4047,7 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); + print_lock_name(NULL, other->class); pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); pr_warn("\nother info that might help us debug this:\n"); @@ -4882,6 +4911,33 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); +#ifdef CONFIG_PROVE_LOCKING +void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, + lock_print_fn print_fn) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + + if (class) { + WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn); + WARN_ON(class->print_fn && class->print_fn != print_fn); + + class->cmp_fn = cmp_fn; + class->print_fn = print_fn; + } + + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn); +#endif + static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock) -- cgit v1.2.3 From e859e429511afb21d3f784bd0ccdf500d40b73ef Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 17 May 2023 10:31:25 +0000 Subject: bpf: Show target_{obj,btf}_id in tracing link fdinfo The target_btf_id can help us understand which kernel function is linked by a tracing prog. The target_btf_id and target_obj_id have already been exposed to userspace, so we just need to show them. 
The result is as follows: $ cat /proc/10673/fdinfo/10 pos: 0 flags: 02000000 mnt_id: 15 ino: 2094 link_type: tracing link_id: 2 prog_tag: a04f5eef06a7f555 prog_id: 13 attach_type: 24 target_obj_id: 1 target_btf_id: 13964 Signed-off-by: Yafang Shao Acked-by: Song Liu Link: https://lore.kernel.org/r/20230517103126.68372-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 909c112ef537..b2621089904b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2968,10 +2968,17 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &target_obj_id, &target_btf_id); seq_printf(seq, - "attach_type:\t%d\n", - tr_link->attach_type); + "attach_type:\t%d\n" + "target_obj_id:\t%u\n" + "target_btf_id:\t%u\n", + tr_link->attach_type, + target_obj_id, + target_btf_id); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, -- cgit v1.2.3 From e924e80ee6a39bc28d2ef8f51e19d336a98e3be0 Mon Sep 17 00:00:00 2001 From: Aditi Ghag Date: Fri, 19 May 2023 22:51:54 +0000 Subject: bpf: Add kfunc filter function to 'struct btf_kfunc_id_set' This commit adds the ability to filter kfuncs to certain BPF program types. This is required to limit the bpf_sock_destroy kfunc implemented in follow-up commits to programs with attach type 'BPF_TRACE_ITER'. The commit adds a callback filter to 'struct btf_kfunc_id_set'. The filter has access to the `bpf_prog` construct including its properties such as `expected_attach_type`. 
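A hedged sketch of how a caller might use the new field (modeled on the bpf_sock_destroy use case the changelog mentions; the names and the exact follow-up code are assumptions, but the filter signature and the nonzero-return-hides-the-kfunc convention come from __btf_kfunc_id_set_contains() below):

	BTF_SET8_START(bpf_sk_iter_kfunc_ids)
	BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
	BTF_SET8_END(bpf_sk_iter_kfunc_ids)

	/* hide the kfunc from everything but iterator programs */
	static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
	{
		if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
		    prog->expected_attach_type != BPF_TRACE_ITER)
			return -EACCES;
		return 0;
	}

	static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
		.owner	= THIS_MODULE,
		.set	= &bpf_sk_iter_kfunc_ids,
		.filter	= tracing_iter_filter,
	};
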
Signed-off-by: Aditi Ghag Link: https://lore.kernel.org/r/20230519225157.760788-7-aditi.ghag@isovalent.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 65 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/bpf/verifier.c | 7 +++--- 2 files changed, 58 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b682b8e4b50..947f0b83bfad 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -222,10 +222,17 @@ enum btf_kfunc_hook { enum { BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, + BTF_KFUNC_FILTER_MAX_CNT = 16, +}; + +struct btf_kfunc_hook_filter { + btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT]; + u32 nr_filters; }; struct btf_kfunc_set_tab { struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; + struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { @@ -7669,9 +7676,12 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - struct btf_id_set8 *add_set) + const struct btf_kfunc_id_set *kset) { + struct btf_kfunc_hook_filter *hook_filter; + struct btf_id_set8 *add_set = kset->set; bool vmlinux_set = !btf_is_module(btf); + bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; u32 set_cnt; @@ -7686,6 +7696,24 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, return 0; tab = btf->kfunc_set_tab; + + if (tab && add_filter) { + u32 i; + + hook_filter = &tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i] == kset->filter) { + add_filter = false; + break; + } + } + + if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) { + ret = -E2BIG; + goto end; + } + } + if (!tab) { tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); if (!tab) @@ -7708,7 +7736,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, */ if (!vmlinux_set) { tab->sets[hook] = add_set; - return 0; + goto do_add_filter; } /* In case of vmlinux sets, there may be more than one set being @@ -7750,6 +7778,11 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); +do_add_filter: + if (add_filter) { + hook_filter = &tab->hook_filters[hook]; + hook_filter->filters[hook_filter->nr_filters++] = kset->filter; + } return 0; end: btf_free_kfunc_set_tab(btf); @@ -7758,15 +7791,22 @@ end: static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, - u32 kfunc_btf_id) + u32 kfunc_btf_id, + const struct bpf_prog *prog) { + struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; - u32 *id; + u32 *id, i; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return NULL; + } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -7821,23 +7861,25 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * protection for looking up a well-formed btf->kfunc_set_tab. 
*/ u32 *btf_kfunc_id_set_contains(const struct btf *btf, - enum bpf_prog_type prog_type, - u32 kfunc_btf_id) + u32 kfunc_btf_id, + const struct bpf_prog *prog) { + enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); } -u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, + const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, @@ -7868,7 +7910,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, goto err_out; } - ret = btf_populate_kfunc_set(btf, hook, kset->set); + ret = btf_populate_kfunc_set(btf, hook, kset); + err_out: btf_put(btf); return ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f597491259ab..af70dad655ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10939,7 +10939,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); + kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); if (!kfunc_flags) { return -EACCES; } @@ -19010,7 +19010,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * in the fmodret id set with the KF_SLEEPABLE flag. */ else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, + prog); if (flags && (*flags & KF_SLEEPABLE)) ret = 0; @@ -19038,7 +19039,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return -EINVAL; } ret = -EINVAL; - if (btf_kfunc_is_modify_return(btf, btf_id) || + if (btf_kfunc_is_modify_return(btf, btf_id, prog) || !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { -- cgit v1.2.3 From e2a1f85bf9f509afd09b5d3308e3489b65845c28 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 14 May 2023 09:33:38 -0700 Subject: sched/psi: Avoid resetting the min update period when it is unnecessary Psi_group's poll_min_period is determined by the minimum window size of psi_trigger when creating new triggers. While destroying a psi_trigger, there is no need to reset poll_min_period if the psi_trigger being destroyed did not have the minimum window size, since in this condition poll_min_period will remain the same as before. 
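As an illustrative example (UPDATES_PER_WINDOW is 10 in psi.c): with triggers of window sizes 1s and 4s, rtpoll_min_period is 100ms. Destroying the 4s trigger cannot change that minimum, so the recomputation over the remaining triggers is skipped; destroying the 1s trigger still performs it.
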
Signed-off-by: Yang Yang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Suren Baghdasaryan Link: https://lkml.kernel.org/r/20230514163338.834345-1-surenb@google.com --- kernel/sched/psi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index b49af594ad28..81fca77397f6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1407,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_nr_triggers[t->state]--; if (!group->rtpoll_nr_triggers[t->state]) group->rtpoll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->rtpoll_triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->rtpoll_min_period = period; + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } /* Destroy rtpoll_task when the last trigger is destroyed */ if (group->rtpoll_states == 0) { group->rtpoll_until = 0; -- cgit v1.2.3 From 658888050998d0f0f7d8eca0c4dbb304bfa408bb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 21 May 2023 00:06:34 +0800 Subject: cgroup/cpuset: remove unneeded header files Remove some unnecessary header files. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2c76fcd9f0bc..b0aee733b92b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,25 +25,13 @@ #include #include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -51,18 +39,9 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include -#include #include #include -#include -#include #include #include #include -- cgit v1.2.3 From a495108ea99c64ce6b5727cb163162ba28e27bff Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 11 May 2023 09:27:15 +0800 Subject: capability: fix kernel-doc warnings in capability.c Fix all kernel-doc warnings in capability.c: kernel/capability.c:477: warning: Function parameter or member 'idmap' not described in 'privileged_wrt_inode_uidgid' kernel/capability.c:493: warning: Function parameter or member 'idmap' not described in 'capable_wrt_inode_uidgid' Signed-off-by: Gaosheng Cui Acked-by: Serge Hallyn Signed-off-by: Paul Moore --- kernel/capability.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 3e058f41df32..1a2795102ae4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -467,6 +467,7 @@ EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question + * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. 
@@ -481,6 +482,7 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped + * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * -- cgit v1.2.3 From c33080cdc0cab7e72c5e4841cb7533d18a3130dc Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Wed, 17 May 2023 14:49:10 +0000 Subject: cgroup: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index aeef06c465ef..d55216c4cc2d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -563,7 +563,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), + strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); @@ -797,7 +797,7 @@ void cgroup1_release_agent(struct work_struct *work) goto out_free; spin_lock(&release_agent_path_lock); - strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; -- cgit v1.2.3 From e7d85427ef898afe66c4c1b7e06e5659cec6b640 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 22 May 2023 16:29:14 -0700 Subject: bpf: Validate BPF object in BPF_OBJ_PIN before calling LSM Do a sanity check whether the provided file-to-be-pinned is actually a BPF object (prog, map, btf) before calling the security_path_mknod LSM hook. If it's not, the LSM hook doesn't have to be triggered, as the operation has no chance of succeeding anyway. 
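One practical effect, inferred from the hunk below (which also hoists the bpf_dir_iops check above the LSM call): pinning to a path outside a BPF filesystem mount now fails with -EPERM before security_path_mknod() is consulted, so LSMs no longer see mknod requests that were doomed anyway.
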
Suggested-by: Christian Brauner Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Christian Brauner Link: https://lore.kernel.org/bpf/20230522232917.2454595-2-andrii@kernel.org --- kernel/bpf/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9948b542a470..329f27d5cacf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -448,18 +448,17 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, if (IS_ERR(dentry)) return PTR_ERR(dentry); - mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - - ret = security_path_mknod(&path, dentry, mode, 0); - if (ret) - goto out; - dir = d_inode(path.dentry); if (dir->i_op != &bpf_dir_iops) { ret = -EPERM; goto out; } + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + ret = security_path_mknod(&path, dentry, mode, 0); + if (ret) + goto out; + switch (type) { case BPF_TYPE_PROG: ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); -- cgit v1.2.3 From 06c6796e0304234da65e70577f354cb194086521 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2023 01:12:26 +0200 Subject: cpu/hotplug: Fix off by one in cpuhp_bringup_mask() cpuhp_bringup_mask() iterates over a cpumask and starts all present CPUs up to a caller provided upper limit. The limit variable is decremented and checked for 0 before invoking cpu_up(), which is obviously off by one and prevents the bringup of the last CPU when the limit is equal to the number of present CPUs. Move the decrement and check after the cpu_up() invocation. Fixes: 18415f33e2ac ("cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE") Reported-by: Mark Brown Signed-off-by: Thomas Gleixner Tested-by: Mark Brown Link: https://lore.kernel.org/r/87wn10ufj9.ffs@tglx --- kernel/cpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 005f863a3d2b..88a7ede322bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1770,9 +1770,6 @@ static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int n for_each_cpu(cpu, mask) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - if (!--ncpus) - break; - if (cpu_up(cpu, target) && can_rollback_cpu(st)) { /* * If this failed then cpu_up() might have only @@ -1781,6 +1778,9 @@ static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int n */ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); } + + if (!--ncpus) + break; } } -- cgit v1.2.3 From cb8edce28073a906401c9e421eca7c99f3396da1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 May 2023 16:48:06 -0700 Subject: bpf: Support O_PATH FDs in BPF_OBJ_PIN and BPF_OBJ_GET commands Current UAPI of BPF_OBJ_PIN and BPF_OBJ_GET commands of bpf() syscall forces users to specify pinning location as a string-based absolute or relative (to current working directory) path. This has various implications related to security (e.g., symlink-based attacks) and forces BPF FS to be exposed in the file system, which can cause races with other applications. One piece of feedback we got from folks working heavily with containers was that the inability to use purely FD-based location specification was an unfortunate limitation and hindrance for BPF_OBJ_PIN and BPF_OBJ_GET commands. 
This patch closes this oversight, adding path_fd field to BPF_OBJ_PIN and BPF_OBJ_GET UAPI, following conventions established by *at() syscalls for dirfd + pathname combinations. This now allows interesting possibilities like working with detached BPF FS mount (e.g., to perform multiple pinnings without running a risk of someone interfering with them), and generally making pinning/getting more secure and not prone to any races and/or security attacks. This is demonstrated by a selftest added in subsequent patch that takes advantage of new mount APIs (fsopen, fsconfig, fsmount) to demonstrate creating detached BPF FS mount, pinning, and then getting BPF map out of it, all while never exposing this private instance of BPF FS to outside worlds. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Christian Brauner Link: https://lore.kernel.org/bpf/20230523170013.728457-4-andrii@kernel.org --- kernel/bpf/inode.c | 16 ++++++++-------- kernel/bpf/syscall.c | 25 ++++++++++++++++++++----- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 329f27d5cacf..4174f76133df 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -435,7 +435,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, return ret; } -static int bpf_obj_do_pin(const char __user *pathname, void *raw, +static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -444,7 +444,7 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, 0); + dentry = user_path_create(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -477,7 +477,7 @@ out: return ret; } -int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) { enum bpf_type type; void *raw; @@ -487,14 +487,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) if (IS_ERR(raw)) return PTR_ERR(raw); - ret = bpf_obj_do_pin(pathname, raw, type); + ret = bpf_obj_do_pin(path_fd, pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); return ret; } -static void *bpf_obj_do_get(const char __user *pathname, +static void *bpf_obj_do_get(int path_fd, const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -502,7 +502,7 @@ static void *bpf_obj_do_get(const char __user *pathname, void *raw; int ret; - ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); + ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -526,7 +526,7 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname, int flags) +int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; int f_flags; @@ -537,7 +537,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) if (f_flags < 0) return f_flags; - raw = bpf_obj_do_get(pathname, &type, f_flags); + raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); if (IS_ERR(raw)) return PTR_ERR(raw); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2621089904b..c7f6807215e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2697,23 +2697,38 @@ free_prog: return err; } -#define BPF_OBJ_LAST_FIELD file_flags +#define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) 
|| attr->file_flags != 0) + int path_fd; + + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_pin_user(attr->bpf_fd, path_fd, + u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { + int path_fd; + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || - attr->file_flags & ~BPF_OBJ_FLAG_MASK) + attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } -- cgit v1.2.3 From 2f5edd03ca0d7221a88236b344b84f3fc301b1e3 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 23 May 2023 14:22:19 +0200 Subject: sysctl: Refactor base paths registrations This is part of the general push to deprecate register_sysctl_paths and register_sysctl_table. The old way of doing this through register_sysctl_base and the DECLARE_SYSCTL_BASE macro is replaced with a call to register_sysctl_init. The 5 base paths affected are: "kernel", "vm", "debug", "dev" and "fs". We remove the register_sysctl_base function and the DECLARE_SYSCTL_BASE macro since they are no longer needed. In order to quickly ascertain that the paths did not actually change I executed `find /proc/sys/ | sha1sum` and made sure that the sha was the same before and after the commit.
We end up saving 563 bytes with this change: ./scripts/bloat-o-meter vmlinux.0.base vmlinux.1.refactor-base-paths add/remove: 0/5 grow/shrink: 2/0 up/down: 77/-640 (-563) Function old new delta sysctl_init_bases 55 111 +56 init_fs_sysctls 12 33 +21 vm_base_table 128 - -128 kernel_base_table 128 - -128 fs_base_table 128 - -128 dev_base_table 128 - -128 debug_base_table 128 - -128 Total: Before=21258215, After=21257652, chg -0.00% [mcgrof: modified to use register_sysctl_init() over register_sysctl() and add bloat-o-meter stats] Signed-off-by: Joel Granados Signed-off-by: Luis Chamberlain Tested-by: Stephen Rothwell Acked-by: Christian Brauner --- kernel/sysctl.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..73fa9cf7ee11 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1782,11 +1782,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_max_threads, }, - { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, { .procname = "overflowuid", .data = &overflowuid, @@ -1962,13 +1957,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS /* * User-space scripts rely on the existence of this file @@ -2348,17 +2336,17 @@ static struct ctl_table dev_table[] = { { } }; -DECLARE_SYSCTL_BASE(kernel, kern_table); -DECLARE_SYSCTL_BASE(vm, vm_table); -DECLARE_SYSCTL_BASE(debug, debug_table); -DECLARE_SYSCTL_BASE(dev, dev_table); - int __init sysctl_init_bases(void) { - register_sysctl_base(kernel); - register_sysctl_base(vm); - register_sysctl_base(debug); - register_sysctl_base(dev); + register_sysctl_init("kernel", kern_table); + register_sysctl_init("kernel/usermodehelper", usermodehelper_table); +#ifdef CONFIG_KEYS + register_sysctl_init("kernel/keys", key_sysctls); +#endif + + register_sysctl_init("vm", vm_table); + register_sysctl_init("debug", debug_table); + register_sysctl_init("dev", dev_table); return 0; } -- cgit v1.2.3 From cb0b50b813f6198b7d44ae8e169803440333577a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 9 May 2023 15:49:02 +0200 Subject: module: Remove preempt_disable() from module reference counting. The preempt_disable() section in module_put() was added in commit e1783a240f491 ("module: Use this_cpu_xx to dynamically allocate counters") while the per-CPU counter were switched to another API. The API requires that during the RMW operation the CPU remained the same. This counting API was later replaced with atomic_t in commit 2f35c41f58a97 ("module: Replace module_ref with atomic_t refcnt") Since this atomic_t replacement there is no need to keep preemption disabled while the reference counter is modified. Remove preempt_disable() from module_put(), __module_get() and try_module_get(). 
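Since the counter is a plain atomic_t, the helpers are safe to call from any context without pinning the task to a CPU. A minimal sketch of the calling pattern these helpers serve (not part of the patch; my_call_backend() and the fn callback are hypothetical names):

#include <linux/module.h>

/* Sketch: keep a backend module alive across a single callback. */
static int my_call_backend(struct module *owner, int (*fn)(void))
{
	int ret;

	if (!try_module_get(owner))	/* fails once owner is unloading */
		return -ENODEV;
	ret = fn();			/* owner cannot be freed here */
	module_put(owner);
	return ret;
}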
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 044aa2c9e3cb..ea7d0c7f3e60 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -820,10 +820,8 @@ static struct module_attribute modinfo_refcnt = void __module_get(struct module *module) { if (module) { - preempt_disable(); atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); - preempt_enable(); } } EXPORT_SYMBOL(__module_get); @@ -833,15 +831,12 @@ bool try_module_get(struct module *module) bool ret = true; if (module) { - preempt_disable(); /* Note: here, we can fail to get a reference */ if (likely(module_is_live(module) && atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); else ret = false; - - preempt_enable(); } return ret; } @@ -852,11 +847,9 @@ void module_put(struct module *module) int ret; if (module) { - preempt_disable(); ret = atomic_dec_if_positive(&module->refcnt); WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); - preempt_enable(); } } EXPORT_SYMBOL(module_put); -- cgit v1.2.3 From bc06a9e0874239cb6d4eebcb0ecd1a91ad9272db Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 19 May 2023 08:49:00 -0500 Subject: genirq: Use hlist for managing resend handlers The current implementation utilizes a bitmap for managing interrupt resend handlers, which is allocated based on the SPARSE_IRQ/NR_IRQS macros. However, this method may not efficiently utilize memory during runtime, particularly when IRQ_BITMAP_BITS is large. Address this issue by using an hlist to manage interrupt resend handlers instead of relying on a static bitmap memory allocation. Additionally, a new function, clear_irq_resend(), is introduced and called from irq_shutdown to ensure a graceful teardown of the interrupt. 
Signed-off-by: Shanker Donthineni Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230519134902.1495562-2-sdonthineni@nvidia.com --- kernel/irq/chip.c | 1 + kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 2 ++ kernel/irq/resend.c | 47 ++++++++++++++++++++++++++++++----------------- 4 files changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 49e7bc871fec..2eac5532c3c8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -306,6 +306,7 @@ static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { + clear_irq_resend(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5fdc0b557579..51fc8c497c22 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -113,6 +113,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ int check_irq_resend(struct irq_desc *desc, bool inject); +void clear_irq_resend(struct irq_desc *desc); +void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 240e145e969f..b401b89b226a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -415,6 +415,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); kobject_init(&desc->kobj, &irq_kobj_type); + irq_resend_init(desc); return desc; @@ -581,6 +582,7 @@ int __init early_irq_init(void) mutex_init(&desc[i].request_mutex); init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); + irq_resend_init(desc); } return arch_early_irq_init(); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 0c46e9fe3a89..edec335c0a7a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -21,8 +21,9 @@ #ifdef CONFIG_HARDIRQS_SW_RESEND -/* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); +/* hlist_head to handle software resend of interrupts: */ +static HLIST_HEAD(irq_resend_list); +static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's @@ -30,18 +31,17 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; - int irq; - - while (!bitmap_empty(irqs_resend, nr_irqs)) { - irq = find_first_bit(irqs_resend, nr_irqs); - clear_bit(irq, irqs_resend); - desc = irq_to_desc(irq); - if (!desc) - continue; - local_irq_disable(); + + raw_spin_lock_irq(&irq_resend_lock); + while (!hlist_empty(&irq_resend_list)) { + desc = hlist_entry(irq_resend_list.first, struct irq_desc, + resend_node); + hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); - local_irq_enable(); + raw_spin_lock(&irq_resend_lock); } + raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ @@ -49,8 +49,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { - unsigned int irq = irq_desc_get_irq(desc); - /* * Validate whether this interrupt can be safely injected from * non interrupt context @@ -70,16 +68,31 @@ static int 
irq_sw_resend(struct irq_desc *desc) */ if (!desc->parent_irq) return -EINVAL; - irq = desc->parent_irq; } - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); + /* Add to resend_list and activate the softirq: */ + raw_spin_lock(&irq_resend_lock); + hlist_add_head(&desc->resend_node, &irq_resend_list); + raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; } +void clear_irq_resend(struct irq_desc *desc) +{ + raw_spin_lock(&irq_resend_lock); + hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); +} + +void irq_resend_init(struct irq_desc *desc) +{ + INIT_HLIST_NODE(&desc->resend_node); +} #else +void clear_irq_resend(struct irq_desc *desc) {} +void irq_resend_init(struct irq_desc *desc) {} + static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; -- cgit v1.2.3 From 5e630aa8d9fcd4c0cb6d5d09422009533aba979a Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 19 May 2023 08:49:01 -0500 Subject: genirq: Encapsulate sparse bitmap handling Move the open coded sparse bitmap handling into helper functions as a preparatory step for converting the sparse interrupt management to a maple tree. No functional change. Signed-off-by: Shanker Donthineni Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230519134902.1495562-3-sdonthineni@nvidia.com --- kernel/irq/internals.h | 4 ++-- kernel/irq/irqdesc.c | 30 ++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 51fc8c497c22..f3f2090dd2de 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,9 +12,9 @@ #include #ifdef CONFIG_SPARSE_IRQ -# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +# define MAX_SPARSE_IRQS (NR_IRQS + 8196) #else -# define IRQ_BITMAP_BITS NR_IRQS +# define MAX_SPARSE_IRQS NR_IRQS #endif #define istate core_internal_state__do_not_mess_with_it diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b401b89b226a..e0d9dd9b36f9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -131,7 +131,18 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); +static DECLARE_BITMAP(allocated_irqs, MAX_SPARSE_IRQS); + +static int irq_find_free_area(unsigned int from, unsigned int cnt) +{ + return bitmap_find_next_zero_area(allocated_irqs, MAX_SPARSE_IRQS, + from, cnt, 0); +} + +static unsigned int irq_find_at_or_after(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} #ifdef CONFIG_SPARSE_IRQ @@ -517,7 +528,7 @@ err: static int irq_expand_nr_irqs(unsigned int nr) { - if (nr > IRQ_BITMAP_BITS) + if (nr > MAX_SPARSE_IRQS) return -ENOMEM; nr_irqs = nr; return 0; @@ -535,11 +546,11 @@ int __init early_irq_init(void) printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); - if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) - nr_irqs = IRQ_BITMAP_BITS; + if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) + nr_irqs = MAX_SPARSE_IRQS; - if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) - initcnt = IRQ_BITMAP_BITS; + if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) + initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; @@ -812,8 +823,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, - from, cnt, 0); + start = irq_find_free_area(from, cnt); ret = 
-EEXIST; if (irq >=0 && start != irq) goto unlock; @@ -834,11 +844,11 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * - * Returns next irq number after offset or nr_irqs if none is found. + * Returns next irq number at or after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { - return find_next_bit(allocated_irqs, nr_irqs, offset); + return irq_find_at_or_after(offset); } struct irq_desc * -- cgit v1.2.3 From 721255b9826bd11c7a38b585905fc2dd0fb94e52 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 19 May 2023 08:49:02 -0500 Subject: genirq: Use a maple tree for interrupt descriptor management The current implementation uses a static bitmap for interrupt descriptor allocation and a radix tree to store the descriptor pointers for lookup. However, the size of the bitmap is constrained by the build time macro MAX_SPARSE_IRQS, which may not be sufficient to support high-end servers, particularly those with GICv4.1 hardware, which require a large interrupt space to cover LPIs and vSGIs. Replace the bitmap and the radix tree with a maple tree, which not only stores pointers for lookup, but also provides a mechanism to find free ranges. That removes the build time hardcoded upper limit. Signed-off-by: Shanker Donthineni Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230519134902.1495562-4-sdonthineni@nvidia.com --- kernel/irq/internals.h | 2 +- kernel/irq/irqdesc.c | 57 ++++++++++++++++++++++++++++---------------------- 2 files changed, 33 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f3f2090dd2de..7bdb7507efb0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,7 +12,7 @@ #include #ifdef CONFIG_SPARSE_IRQ -# define MAX_SPARSE_IRQS (NR_IRQS + 8196) +# define MAX_SPARSE_IRQS INT_MAX #else # define MAX_SPARSE_IRQS NR_IRQS #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e0d9dd9b36f9..27ca1c866f29 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include #include @@ -131,17 +130,39 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, MAX_SPARSE_IRQS); +static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, + MT_FLAGS_ALLOC_RANGE | + MT_FLAGS_LOCK_EXTERN | + MT_FLAGS_USE_RCU, + sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { - return bitmap_find_next_zero_area(allocated_irqs, MAX_SPARSE_IRQS, - from, cnt, 0); + MA_STATE(mas, &sparse_irqs, 0, 0); + + if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) + return -ENOSPC; + return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { - return find_next_bit(allocated_irqs, nr_irqs, offset); + unsigned long index = offset; + struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs); + + return desc ?
irq_desc_get_irq(desc) : nr_irqs; +} + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + MA_STATE(mas, &sparse_irqs, irq, irq); + WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); +} + +static void delete_irq_desc(unsigned int irq) +{ + MA_STATE(mas, &sparse_irqs, irq, irq); + mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ @@ -355,26 +376,14 @@ static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ -static RADIX_TREE(irq_desc_tree, GFP_KERNEL); - -static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - struct irq_desc *irq_to_desc(unsigned int irq) { - return radix_tree_lookup(&irq_desc_tree, irq); + return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif -static void delete_irq_desc(unsigned int irq) -{ - radix_tree_delete(&irq_desc_tree, irq); -} - #ifdef CONFIG_SMP static void free_masks(struct irq_desc *desc) { @@ -517,7 +526,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } - bitmap_set(allocated_irqs, start, cnt); return start; err: @@ -557,7 +565,6 @@ int __init early_irq_init(void) for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); - set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } return arch_early_irq_init(); @@ -612,6 +619,7 @@ static void free_desc(unsigned int irq) raw_spin_lock_irqsave(&desc->lock, flags); desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); + delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -624,8 +632,8 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; + irq_insert_desc(start + i, desc); } - bitmap_set(allocated_irqs, start, cnt); return start; } @@ -637,7 +645,7 @@ static int irq_expand_nr_irqs(unsigned int nr) void irq_mark_irq(unsigned int irq) { mutex_lock(&sparse_irq_lock); - bitmap_set(allocated_irqs, irq, 1); + irq_insert_desc(irq, irq_desc + irq); mutex_unlock(&sparse_irq_lock); } @@ -781,7 +789,6 @@ void irq_free_descs(unsigned int from, unsigned int cnt) for (i = 0; i < cnt; i++) free_desc(from + i); - bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } EXPORT_SYMBOL_GPL(irq_free_descs); @@ -844,7 +851,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * - * Returns next irq number at or after offset or nr_irqs if none is found. + * Returns next irq number after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { -- cgit v1.2.3 From 5bd4990f19b0fedbba5511b44c1cd1fdb7061f3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:50:13 +0100 Subject: trace: Convert trace/seq to use copy_splice_read() For the splice from the trace seq buffer, just use copy_splice_read(). In the future, something better can probably be done by gifting pages from seq->buf into the pipe, but that would require changing seq->buf into a vmap over an array of pages. 
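Other seq_file-backed interfaces that cannot safely lend their backing pages to a pipe can use the same one-line substitution. A sketch of the resulting file_operations (my_open is a hypothetical seq_open() wrapper; the seq_* helpers and copy_splice_read are real):

static const struct file_operations my_seq_fops = {
	.owner       = THIS_MODULE,
	.open        = my_open,			/* hypothetical */
	.read        = seq_read,
	.read_iter   = seq_read_iter,
	.splice_read = copy_splice_read,	/* copies data, never gifts pages */
	.llseek      = seq_lseek,
	.release     = seq_release,
};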
Signed-off-by: David Howells cc: Christoph Hellwig cc: Al Viro cc: Jens Axboe cc: Steven Rostedt cc: Masami Hiramatsu cc: linux-kernel@vger.kernel.org cc: linux-trace-kernel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20230522135018.2742245-27-dhowells@redhat.com Signed-off-by: Jens Axboe --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..c210d02fac97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5171,7 +5171,7 @@ static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, -- cgit v1.2.3 From 847aea98e01cf084efcb84490b3060af343d1458 Mon Sep 17 00:00:00 2001 From: Wang Honghui Date: Thu, 4 May 2023 17:18:24 +0800 Subject: PM: hibernate: Correct spelling mistake in a comment Fix a typo in a comment in kernel/power/snapshot.c Signed-off-by: Wang Honghui [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cd8b7b35f1e8..b27affb7503f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -398,7 +398,7 @@ struct mem_zone_bm_rtree { unsigned int blocks; /* Number of Bitmap Blocks */ }; -/* strcut bm_position is used for browsing memory bitmaps */ +/* struct bm_position is used for browsing memory bitmaps */ struct bm_position { struct mem_zone_bm_rtree *zone; -- cgit v1.2.3 From c8f6219be2e58d7f676935ae90b64abef5d0966a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 24 May 2023 11:53:39 +0800 Subject: workqueue: Fix WARN_ON_ONCE() triggers in worker_enter_idle() Currently, pool->nr_running can be modified from timer tick, that means the timer tick can run nested inside a not-irq-protected section that's in the process of modifying nr_running. Consider the following scenario: CPU0 kworker/0:2 (events) worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); ->pool->nr_running++; (1) process_one_work() ->worker->current_func(work); ->schedule() ->wq_worker_sleeping() ->worker->sleeping = 1; ->pool->nr_running--; (0) .... ->wq_worker_running() .... CPU0 by interrupt: wq_worker_tick() ->worker_set_flags(worker, WORKER_CPU_INTENSIVE); ->pool->nr_running--; (-1) ->worker->flags |= WORKER_CPU_INTENSIVE; .... ->if (!(worker->flags & WORKER_NOT_RUNNING)) ->pool->nr_running++; (will not execute) ->worker->sleeping = 0; .... ->worker_clr_flags(worker, WORKER_CPU_INTENSIVE); ->pool->nr_running++; (0) .... worker_set_flags(worker, WORKER_PREP); ->pool->nr_running--; (-1) .... worker_enter_idle() ->WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); if the nr_workers is equal to nr_idle, due to the nr_running is not zero, will trigger WARN_ON_ONCE(). [ 2.460602] WARNING: CPU: 0 PID: 63 at kernel/workqueue.c:1999 worker_enter_idle+0xb2/0xc0 [ 2.462163] Modules linked in: [ 2.463401] CPU: 0 PID: 63 Comm: kworker/0:2 Not tainted 6.4.0-rc2-next-20230519 #1 [ 2.463771] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 [ 2.465127] Workqueue: 0x0 (events) [ 2.465678] RIP: 0010:worker_enter_idle+0xb2/0xc0 ... 
[ 2.472614] Call Trace: [ 2.473152] [ 2.474182] worker_thread+0x71/0x430 [ 2.474992] ? _raw_spin_unlock_irqrestore+0x28/0x50 [ 2.475263] kthread+0x103/0x120 [ 2.475493] ? __pfx_worker_thread+0x10/0x10 [ 2.476355] ? __pfx_kthread+0x10/0x10 [ 2.476635] ret_from_fork+0x2c/0x50 [ 2.477051] This commit therefore add the check of worker->sleeping in wq_worker_tick(), if the worker->sleeping is not zero, directly return. tj: Updated comment and description. Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Tested-by: Anders Roxell Closes: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230519/testrun/17078554/suite/boot/test/clang-nightly-lkftconfig/log Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee16ddb0647c..3ad6806c7161 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1051,7 +1051,7 @@ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!worker->sleeping) + if (!READ_ONCE(worker->sleeping)) return; /* @@ -1071,7 +1071,7 @@ void wq_worker_running(struct task_struct *task) */ worker->current_at = worker->task->se.sum_exec_runtime; - worker->sleeping = 0; + WRITE_ONCE(worker->sleeping, 0); } /** @@ -1097,10 +1097,10 @@ void wq_worker_sleeping(struct task_struct *task) pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ - if (worker->sleeping) + if (READ_ONCE(worker->sleeping)) return; - worker->sleeping = 1; + WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* @@ -1143,8 +1143,15 @@ void wq_worker_tick(struct task_struct *task) * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. + * + * Set @worker->sleeping means that @worker is in the process of + * switching out voluntarily and won't be contributing to + * @pool->nr_running until it wakes up. As wq_worker_sleeping() also + * decrements ->nr_running, setting CPU_INTENSIVE here can lead to + * double decrements. The task is releasing the CPU anyway. Let's skip. + * We probably want to make this prettier in the future. */ - if ((worker->flags & WORKER_NOT_RUNNING) || + if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || worker->task->se.sum_exec_runtime - worker->current_at < wq_cpu_intensive_thresh_us * NSEC_PER_USEC) return; -- cgit v1.2.3 From 659db0789c2e66c5d6a52d57008e3a7401a3ffff Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 24 May 2023 14:54:31 +0800 Subject: cgroup: Update out-of-date comment in cgroup_migrate() Commit 674b745e22b3 ("cgroup: remove rcu_read_lock()/rcu_read_unlock() in critical section of spin_lock_irq()") has removed the rcu_read_lock, which makes the comment out-of-date, so update it. tj: Updated the comment a bit. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9d809191a54f..f329f49529a2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2872,9 +2872,9 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct task_struct *task; /* - * Prevent freeing of tasks while we take a snapshot. 
Tasks that are - * already PF_EXITING could be freed from underneath us unless we - * take an rcu_read_lock. + * The following thread iteration should be inside an RCU critical + * section to prevent tasks from being freed while taking the snapshot. + * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; -- cgit v1.2.3 From c4c84f6fb2c4dc4c0f5fd927b3c3d3fd28b7030e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 May 2023 15:54:19 -0700 Subject: bpf: drop unnecessary bpf_capable() check in BPF_MAP_FREEZE command Seems like that extra bpf_capable() check in BPF_MAP_FREEZE handler was unintentionally left when we switched to a model that all BPF map operations should be allowed regardless of CAP_BPF (or any other capabilities), as long as process got BPF map FD somehow. This patch replaces bpf_capable() check in BPF_MAP_FREEZE handler with writeable access check, given conceptually freezing the map is modifying it: map becomes unmodifiable for subsequent updates. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230524225421.1587859-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c7f6807215e6..c9a201e4c457 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1931,6 +1931,11 @@ static int map_freeze(const union bpf_attr *attr) return -ENOTSUPP; } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { err = -EBUSY; @@ -1940,10 +1945,6 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!bpf_capable()) { - err = -EPERM; - goto err_put; - } WRITE_ONCE(map->frozen, true); err_put: -- cgit v1.2.3 From 18c8ae813156a6855f026de80fffb91e1a28ab3d Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 25 May 2023 12:00:38 +0800 Subject: workqueue: Disable per-cpu CPU hog detection when wq_cpu_intensive_thresh_us is 0 If workqueue.cpu_intensive_thresh_us is set to 0, the detection mechanism for CPU-hogging per-cpu work item will keep triggering spuriously: workqueue: process_srcu hogged CPU for >0us 4 times, consider switching to WQ_UNBOUND workqueue: gc_worker hogged CPU for >0us 4 times, consider switching to WQ_UNBOUND workqueue: gc_worker hogged CPU for >0us 8 times, consider switching to WQ_UNBOUND workqueue: wait_rcu_exp_gp hogged CPU for >0us 4 times, consider switching to WQ_UNBOUND workqueue: kfree_rcu_monitor hogged CPU for >0us 4 times, consider switching to WQ_UNBOUND workqueue: kfree_rcu_monitor hogged CPU for >0us 8 times, consider switching to WQ_UNBOUND workqueue: reg_todo hogged CPU for >0us 4 times, consider switching to WQ_UNBOUND This commit therefore disables the CPU-hog detection mechanism when workqueue.cpu_intensive_thresh_us is set to 0. tj: Patch description updated and the condition check on cpu_intensive_thresh_us separated into a separate if statement for readability. 
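For reference, the threshold is a runtime-writable module parameter; its definition in kernel/workqueue.c (from the earlier CPU-hog detection work, reproduced here as illustrative context rather than as part of this patch) looks roughly like:

/* 0 now means "detection disabled" rather than "0us threshold" */
static unsigned long wq_cpu_intensive_thresh_us = 10000;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us,
		   ulong, 0644);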
Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3ad6806c7161..41b8388f4284 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1139,6 +1139,9 @@ void wq_worker_tick(struct task_struct *task) pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; + if (!wq_cpu_intensive_thresh_us) + return; + /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked -- cgit v1.2.3 From 4266f41feaeee2521749ce2cfb52eafd4e2947c5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 May 2023 12:13:56 +0200 Subject: bpf: Fix bad unlock balance on freeze_mutex Commit c4c84f6fb2c4 ("bpf: drop unnecessary bpf_capable() check in BPF_MAP_FREEZE command") moved the permissions check outside of the freeze_mutex in the map_freeze() handler. The error paths still jumps to the err_put which tries to unlock the freeze_mutex even though it was not locked in the first place. Fix it. Fixes: c4c84f6fb2c4 ("bpf: drop unnecessary bpf_capable() check in BPF_MAP_FREEZE command") Reported-by: syzbot+8982e75c2878b9ffeac5@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c9a201e4c457..92a57efc77de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1932,8 +1932,8 @@ static int map_freeze(const union bpf_attr *attr) } if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { - err = -EPERM; - goto err_put; + fdput(f); + return -EPERM; } mutex_lock(&map->freeze_mutex); -- cgit v1.2.3 From eb07c4f39c3e858a7d0cc4bb15b8a304f83f0497 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 May 2023 09:06:34 +0200 Subject: mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED As discussed at LSF/MM [1] [2] and with no objections raised there, deprecate the SLAB allocator. Rename the user-visible option so that users with CONFIG_SLAB=y get a new prompt with explanation during make oldconfig, while make olddefconfig will just switch to SLUB. In all defconfigs with CONFIG_SLAB=y remove the line so those also switch to SLUB. Regressions due to the switch should be reported to linux-mm and slab maintainers. 
[1] https://lore.kernel.org/all/4b9fc9c6-b48c-198f-5f80-811a44737e5f@suse.cz/ [2] https://lwn.net/Articles/932201/ Signed-off-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Acked-by: Geert Uytterhoeven # m68k Acked-by: Helge Deller # parisc --- kernel/configs/tiny.config | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 144b2bd86b14..00009f7d0835 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,6 +6,5 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -# CONFIG_SLAB is not set CONFIG_SLUB=y CONFIG_SLUB_TINY=y -- cgit v1.2.3 From 15d5daa0a7006b9bd4dcc49f90e7ac8ddbe102f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:18:07 +0200 Subject: kallsyms: remove unused arch_get_kallsym() helper The arch_get_kallsym() function was introduced so that x86 could override it, but that override was removed in bf904d2762ee ("x86/pti/64: Remove the SYSCALL64 entry trampoline"), so now this does nothing except causing a warning about a missing prototype: kernel/kallsyms.c:662:12: error: no previous prototype for 'arch_get_kallsym' [-Werror=missing-prototypes] 662 | int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, Restore the old behavior before d83212d5dd67 ("kallsyms, x86: Export addresses of PTI entry trampolines") to simplify the code and avoid the warning. Signed-off-by: Arnd Bergmann Tested-by: Alan Maguire [mcgrof: fold in bpf selftest fix] Signed-off-by: Luis Chamberlain --- kernel/kallsyms.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 77747391f49b..290d8cc1943a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -646,7 +646,6 @@ int sprint_backtrace_build_id(char *buffer, unsigned long address) /* To avoid using get_symbol_offset for every symbol, we carry prefix along. 
*/ struct kallsym_iter { loff_t pos; - loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; loff_t pos_bpf_end; @@ -659,29 +658,9 @@ struct kallsym_iter { int show_value; }; -int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name) -{ - return -EINVAL; -} - -static int get_ksymbol_arch(struct kallsym_iter *iter) -{ - int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name); - - if (ret < 0) { - iter->pos_arch_end = iter->pos; - return 0; - } - - return 1; -} - static int get_ksymbol_mod(struct kallsym_iter *iter) { - int ret = module_get_kallsym(iter->pos - iter->pos_arch_end, + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); @@ -764,7 +743,6 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { - iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; iter->pos_bpf_end = 0; @@ -780,10 +758,6 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if ((!iter->pos_arch_end || iter->pos_arch_end > pos) && - get_ksymbol_arch(iter)) return 1; - if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; -- cgit v1.2.3 From c7dce4c5d9f6b17feec5ec6056453d019ee4d13b Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 16 May 2023 14:39:56 +0000 Subject: tracing: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement with strscpy() is safe.
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Acked-by: Masami Hiramatsu (Google) Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230516143956.1367827-1-azeemshaikh38@gmail.com --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_inject.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..28ccd0c9bdf0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -196,7 +196,7 @@ static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { - strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; @@ -281,7 +281,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -291,7 +291,7 @@ static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { - strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } @@ -2521,7 +2521,7 @@ static void __trace_find_cmdline(int pid, char comm[]) if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..dc83a259915b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2831,7 +2831,7 @@ static __init int setup_trace_triggers(char *str) char *buf; int i; - strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event triggers"); @@ -3621,7 +3621,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event tracing"); diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d6b4935a78c0..abe805d471eb 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -217,7 +217,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) char *addr = (char *)(unsigned long) val; if (field->filter_type == FILTER_STATIC_STRING) { - strlcpy(entry + field->offset, addr, field->size); + strscpy(entry + field->offset, addr, field->size); } else if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; @@ -232,7 +232,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) } entry = *pentry; - strlcpy(entry + (entry_size - str_len), addr, str_len); + strscpy(entry + (entry_size - 
str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..1b3fa7b854aa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..73055ba8d8ef 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -254,7 +254,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, trace_probe_log_err(offset, GROUP_TOO_LONG); return -EINVAL; } - strlcpy(buf, event, slash - event + 1); + strscpy(buf, event, slash - event + 1); if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; -- cgit v1.2.3 From 4f521bab5bfc854ec0dab7ef560dfa75247e615d Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 26 May 2023 12:51:23 +0530 Subject: kallsyms: remove unsed API lookup_symbol_attrs with commit '7878c231dae0 ("slab: remove /proc/slab_allocators")' lookup_symbol_attrs usage is removed. Thus removing redundant API. Signed-off-by: Maninder Singh Reviewed-by: Kees Cook Signed-off-by: Luis Chamberlain --- kernel/kallsyms.c | 28 ---------------------------- kernel/module/kallsyms.c | 28 ---------------------------- 2 files changed, 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 290d8cc1943a..8193e947aa10 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -484,34 +484,6 @@ found: return 0; } -int lookup_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - int res; - - name[0] = '\0'; - name[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, size, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), - name, KSYM_NAME_LEN); - modname[0] = '\0'; - goto found; - } - /* See if it's in a module. */ - res = lookup_module_symbol_attrs(addr, size, offset, modname, name); - if (res) - return res; - -found: - cleanup_symbol_name(name); - return 0; -} - /* Look up a kernel symbol and return it in a text buffer. 
*/ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index c550d7d45f2f..ef73ae7c8909 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -381,34 +381,6 @@ out: return -ERANGE; } -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strscpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strscpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { -- cgit v1.2.3 From 49c386ebbb43394ff4773ce24f726f6afc4c30c8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 May 2023 22:23:35 +0900 Subject: Revert "kheaders: substituting --sort in archive creation" This reverts commit 700dea5a0bea9f64eba89fae7cb2540326fdfdc1. The reason for that commit was --sort=ORDER introduced in tar 1.28 (2014). More than 3 years have passed since then. Requiring GNU tar 1.28 should be fine now because we require GCC 5.1 (2015). Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- kernel/gen_kheaders.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 1ef9a87511f5..6d443ea22bb7 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -83,12 +83,9 @@ find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' # Create archive and try to normalize metadata for reproducibility. -# For compatibility with older versions of tar, files are fed to tar -# pre-sorted, as --sort=name might not be available. -find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ - tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --numeric-owner --no-recursion \ - -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null +tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --sort=name --numeric-owner \ + -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From 143f83e2003a4c3ca0c2558254129569048e0759 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 26 May 2023 10:58:20 +0100 Subject: perf: Allow a PMU to have a parent Some PMUs have well defined parents such as PCI devices. As the device_initialize() and device_add() are all within pmu_dev_alloc() which is called from perf_pmu_register() there is no opportunity to set the parent from within a driver. Add a struct device *parent field to struct pmu and use that to set the parent. 
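A sketch of how a driver would wire the new field, assuming a hypothetical PCI PMU driver (my_probe and the elided callbacks are illustrative names, not from this patch):

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct pmu *pmu;

	pmu = devm_kzalloc(&pdev->dev, sizeof(*pmu), GFP_KERNEL);
	if (!pmu)
		return -ENOMEM;

	pmu->module = THIS_MODULE;
	pmu->parent = &pdev->dev;	/* the new field from this patch */
	pmu->task_ctx_nr = perf_invalid_context;
	/* .event_init/.add/.del/.start/.stop/.read callbacks elided */

	return perf_pmu_register(pmu, pci_name(pdev), -1);
}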
Reviewed-by: Dan Williams Reviewed-by: Greg Kroah-Hartman Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20230526095824.16336-2-Jonathan.Cameron@huawei.com Signed-off-by: Dan Williams --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index db016e418931..285cf6ca6e81 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11379,6 +11379,7 @@ static int pmu_dev_alloc(struct pmu *pmu) dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; + pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); -- cgit v1.2.3 From d55ebae3f3122b07689cc4c34043114e09ce904c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 May 2023 21:50:17 +0200 Subject: sched: Hide unused sched_update_scaling() This function is only used when CONFIG_SMP is enabled, without that there is no caller and no prototype: kernel/sched/fair.c:688:5: error: no previous prototype for 'sched_update_scaling' [-Werror=missing-prototypes Hide the definition in the same #ifdef check as the declaration. Fixes: 8a99b6833c88 ("sched: Move SCHED_DEBUG sysctl to debugfs") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230522195021.3456768-2-arnd@kernel.org --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 48b6f0ca13ac..2c1b345c3b8d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -684,7 +684,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ - +#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -702,6 +702,7 @@ int sched_update_scaling(void) return 0; } #endif +#endif /* * delta /= w -- cgit v1.2.3 From 378be384e01f13fc44d0adc70873de525586ad74 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 May 2023 21:50:18 +0200 Subject: sched: Add schedule_user() declaration The schedule_user() function is used on powerpc and sparc architectures, but only ever called from assembler, so it has no prototype, causing a harmless W=1 warning: kernel/sched/core.c:6730:35: error: no previous prototype for 'schedule_user' [-Werror=missing-prototypes] Add a prototype in sched/sched.h to shut up the warning. 
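The general rule behind the fix: under -Wmissing-prototypes every external function needs a declaration visible at its definition site, even when all callers live in assembly. A sketch of the resulting pattern (the qualifiers on the definition follow the usual convention for asm-called functions; the body is elided):

/* kernel/sched/sched.h: the added declaration */
asmlinkage void schedule_user(void);

/* kernel/sched/core.c includes sched.h, so the definition now has a
 * previous prototype and the W=1 warning disappears: */
asmlinkage __visible void __sched schedule_user(void)
{
	/* ... unchanged ... */
}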
Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230522195021.3456768-3-arnd@kernel.org --- kernel/sched/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 678446251c35..192e7816234e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2376,6 +2376,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) #endif extern void schedule_idle(void); +asmlinkage void schedule_user(void); extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); -- cgit v1.2.3 From c0bdfd72fbfb7319581bd5bb09b4f10979385bac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 May 2023 21:50:19 +0200 Subject: sched/fair: Hide unused init_cfs_bandwidth() stub init_cfs_bandwidth() is only used when CONFIG_FAIR_GROUP_SCHED is enabled, and without this causes a W=1 warning for the missing prototype: kernel/sched/fair.c:6131:6: error: no previous prototype for 'init_cfs_bandwidth' The normal implementation is only defined for CONFIG_CFS_BANDWIDTH, so the stub exists when CFS_BANDWIDTH is disabled but FAIR_GROUP_SCHED is enabled. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230522195021.3456768-4-arnd@kernel.org --- kernel/sched/fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c1b345c3b8d..a7a8ccde3bd7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6169,9 +6169,8 @@ static inline int throttled_lb_pair(struct task_group *tg, return 0; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - #ifdef CONFIG_FAIR_GROUP_SCHED +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif -- cgit v1.2.3 From f7df852ad6dbb84644e75df7402d9a34f39f31bd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 May 2023 21:50:20 +0200 Subject: sched: Make task_vruntime_update() prototype visible Having the prototype next to the caller but not visible to the callee causes a W=1 warning: kernel/sched/fair.c:11985:6: error: no previous prototype for 'task_vruntime_update' [-Werror=missing-prototypes] Move this to a header, as we do for all other function declarations. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230522195021.3456768-5-arnd@kernel.org --- kernel/sched/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 192e7816234e..ce07782f0f6c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1245,6 +1245,7 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie -- cgit v1.2.3 From 7aa55f2a5902646a19db89dab9961867724b27b8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 May 2023 21:50:21 +0200 Subject: sched/fair: Move unused stub functions to header These four functions have a normal definition for CONFIG_FAIR_GROUP_SCHED, and empty one that is only referenced when FAIR_GROUP_SCHED is disabled but CGROUP_SCHED is still enabled. 
If both are turned off, the functions are still defined but the missing prototype causes a W=1 warning: kernel/sched/fair.c:12544:6: error: no previous prototype for 'free_fair_sched_group' kernel/sched/fair.c:12546:5: error: no previous prototype for 'alloc_fair_sched_group' kernel/sched/fair.c:12553:6: error: no previous prototype for 'online_fair_sched_group' kernel/sched/fair.c:12555:6: error: no previous prototype for 'unregister_fair_sched_group' Move the alternatives into the header as static inline functions with the correct combination of #ifdef checks to avoid the warning without adding even more complexity. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230522195021.3456768-6-arnd@kernel.org --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7a8ccde3bd7..48b6f0ca13ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -684,7 +684,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP + int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -702,7 +702,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif /* * delta /= w @@ -6169,8 +6168,9 @@ static inline int throttled_lb_pair(struct task_group *tg, return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce07782f0f6c..678446251c35 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1245,7 +1245,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -2377,7 +2376,6 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) #endif extern void schedule_idle(void); -asmlinkage void schedule_user(void); extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); -- cgit v1.2.3 From 3f4bf7aa315bf55b2a569bf77f61ff81c7e11fc1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 May 2023 18:25:14 +0800 Subject: sched/deadline: remove unused dl_bandwidth The default deadline bandwidth control structure has been removed since commit eb77cf1c151c ("sched/deadline: Remove unused def_dl_bandwidth") leading to unused init_dl_bandwidth() and struct dl_bandwidth. Remove them to clean up the code.
Signed-off-by: Miaohe Lin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20230524102514.407486-1-linmiaohe@huawei.com --- kernel/sched/deadline.c | 7 ------- kernel/sched/sched.h | 7 ------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..f827067ad03b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) -{ - raw_spin_lock_init(&dl_b->dl_runtime_lock); - dl_b->dl_period = period; - dl_b->dl_runtime = runtime; -} - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 678446251c35..d8ba81c66579 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -286,12 +286,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; -}; - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -2394,7 +2388,6 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); -- cgit v1.2.3 From e6c2f594ed961273479505b42040782820190305 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 30 May 2023 13:50:29 -0700 Subject: bpf: Silence a warning in btf_type_id_size() syzbot reported a warning in [1] with the following stacktrace: WARNING: CPU: 0 PID: 5005 at kernel/bpf/btf.c:1988 btf_type_id_size+0x2d9/0x9d0 kernel/bpf/btf.c:1988 ... RIP: 0010:btf_type_id_size+0x2d9/0x9d0 kernel/bpf/btf.c:1988 ... Call Trace: map_check_btf kernel/bpf/syscall.c:1024 [inline] map_create+0x1157/0x1860 kernel/bpf/syscall.c:1198 __sys_bpf+0x127f/0x5420 kernel/bpf/syscall.c:5040 __do_sys_bpf kernel/bpf/syscall.c:5162 [inline] __se_sys_bpf kernel/bpf/syscall.c:5160 [inline] __x64_sys_bpf+0x79/0xc0 kernel/bpf/syscall.c:5160 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd With the following btf [1] DECL_TAG 'a' type_id=4 component_idx=-1 [2] PTR '(anon)' type_id=0 [3] TYPE_TAG 'a' type_id=2 [4] VAR 'a' type_id=3, linkage=static and when the bpf_attr.btf_key_type_id = 1 (DECL_TAG), the following WARN_ON_ONCE in btf_type_id_size() is triggered: if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && !btf_type_is_var(size_type))) return NULL; Note that 'return NULL' is the correct behavior as we don't want a DECL_TAG type to be used as a btf_{key,value}_type_id even for the case like 'DECL_TAG -> STRUCT'. So there is no correctness issue here, we just want to silence warning. To silence the warning, I added DECL_TAG as one of kinds in btf_type_nosize() which will cause btf_type_id_size() returning NULL earlier without the warning. 
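A condensed userspace sketch of the trigger, assuming the four BTF types above are already loaded and btf_fd refers to them (field names as in union bpf_attr; after this patch the call still fails, just without the kernel WARN):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_map_with_decl_tag_key(int btf_fd)
{
	union bpf_attr attr = {
		.map_type          = BPF_MAP_TYPE_HASH,
		.key_size          = 4,
		.value_size        = 4,
		.max_entries       = 1,
		.btf_fd            = btf_fd,
		.btf_key_type_id   = 1,	/* the DECL_TAG type */
		.btf_value_type_id = 2,	/* the PTR type */
	};

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}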
[1] https://lore.kernel.org/bpf/000000000000e0df8d05fc75ba86@google.com/ Reported-by: syzbot+958967f249155967d42a@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230530205029.264910-1-yhs@fb.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 947f0b83bfad..bd2cac057928 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -492,25 +492,26 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_nosize(const struct btf_type *t) +static bool btf_type_is_datasec(const struct btf_type *t) { - return btf_type_is_void(t) || btf_type_is_fwd(t) || - btf_type_is_func(t) || btf_type_is_func_proto(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static bool btf_type_nosize_or_null(const struct btf_type *t) +static bool btf_type_is_decl_tag(const struct btf_type *t) { - return !t || btf_type_nosize(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } -static bool btf_type_is_datasec(const struct btf_type *t) +static bool btf_type_nosize(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; + return btf_type_is_void(t) || btf_type_is_fwd(t) || - btf_type_is_func(t) || btf_type_is_func_proto(t); + btf_type_is_func(t) || btf_type_is_func_proto(t) || + btf_type_is_decl_tag(t); } -static bool btf_type_is_decl_tag(const struct btf_type *t) +static bool btf_type_nosize_or_null(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; + return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) -- cgit v1.2.3 From 996ef312f27fa8ee8715c6ec77b6a3cdb748bdca Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 18 May 2023 13:40:15 -0700 Subject: sysctl: remove empty dev table Now that all the dev sysctls have been moved out we can remove the dev sysctl base directory. We don't need to create base directories, they are created for you as if using 'mkdir -p' with register_sysctl() and register_sysctl_init(). For details refer to sysctl_mkdir_p() usage. We save 90 bytes with this change: ./scripts/bloat-o-meter vmlinux.2.remove-sysctl-table vmlinux.3-remove-dev-table add/remove: 0/1 grow/shrink: 0/1 up/down: 0/-90 (-90) Function old new delta sysctl_init_bases 111 85 -26 dev_table 64 - -64 Total: Before=21257057, After=21256967, chg -0.00% The empty dev table has been in place since the v2.5.0 days because back then ordering was essential. But with commit 7ec66d06362d ("sysctl: Stop requiring explicit management of sysctl directories"), merged as of v3.4-rc1, explicit ordering of directories was replaced by sysctl directory autogeneration. This new mechanism, introduced in v3.4, allows for sysctl directories to automatically be created for sysctl tables when they are needed and automatically removes them when no sysctl tables use them. That commit also added a dedicated struct ctl_dir as a new type for these autogenerated directories. 
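To illustrate the 'mkdir -p' behaviour described above, a hedged sketch of registering a table under a nested path; the "dev/example" path, the knob name, and the backing variable are invented for illustration and follow the table style used elsewhere in this series:

        static int example_knob;

        /* Sketch: neither "dev" nor "dev/example" has to exist beforehand;
         * the sysctl core creates the intermediate directories on demand
         * and removes them again when the last table using them goes away,
         * which is why empty placeholder tables like dev_table can go.
         */
        static struct ctl_table example_table[] = {
                {
                        .procname     = "knob",
                        .data         = &example_knob,
                        .maxlen       = sizeof(int),
                        .mode         = 0644,
                        .proc_handler = proc_dointvec
                },
                { }
        };

        static int __init example_sysctl_init(void)
        {
                register_sysctl_init("dev/example", example_table);
                return 0;
        }
        early_initcall(example_sysctl_init);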
Reviewed-by: Joel Granados Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 73fa9cf7ee11..9552f221f568 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2332,10 +2332,6 @@ static struct ctl_table debug_table[] = { { } }; -static struct ctl_table dev_table[] = { - { } -}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); @@ -2346,7 +2342,6 @@ int __init sysctl_init_bases(void) register_sysctl_init("vm", vm_table); register_sysctl_init("debug", debug_table); - register_sysctl_init("dev", dev_table); return 0; } -- cgit v1.2.3 From 01e6aac78b1c5c9e7115c15f5dbf942959b8f3ad Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 18 May 2023 13:37:41 -0700 Subject: signal: move show_unhandled_signals sysctl to its own file The show_unhandled_signals sysctl is the only sysctl for debug left in kernel/sysctl.c. We've been moving the sysctls out of kernel/sysctl.c to help avoid merge conflicts as the shared array gets out of hand. This change simplifies sysctl registration by localizing it where it should go, at the cost of increasing the kernel by 23 bytes; we accept this given that recent cleanups have already saved us 1465 bytes in the prior commits. ./scripts/bloat-o-meter vmlinux.3-remove-dev-table vmlinux.4-remove-debug-table add/remove: 3/1 grow/shrink: 0/1 up/down: 177/-154 (23) Function old new delta signal_debug_table - 128 +128 init_signal_sysctls - 33 +33 __pfx_init_signal_sysctls - 16 +16 sysctl_init_bases 85 59 -26 debug_table 128 - -128 Total: Before=21256967, After=21256990, chg +0.00% Reviewed-by: Joel Granados Signed-off-by: Luis Chamberlain --- kernel/signal.c | 23 +++++++++++++++++++++++ kernel/sysctl.c | 14 -------------- 2 files changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8f6330f0e9ca..5ba4150c01a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -45,6 +45,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -4771,6 +4772,28 @@ static inline void siginfo_buildtime_checks(void) #endif } +#if defined(CONFIG_SYSCTL) +static struct ctl_table signal_debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { } +}; + +static int __init init_signal_sysctls(void) +{ + register_sysctl_init("debug", signal_debug_table); + return 0; +} +early_initcall(init_signal_sysctls); +#endif /* CONFIG_SYSCTL */ + void __init signals_init(void) { siginfo_buildtime_checks(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9552f221f568..241b817c0240 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2319,19 +2319,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table debug_table[] = { -#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { } -}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); @@ -2341,7 +2328,6 @@ int __init sysctl_init_bases(void) #endif register_sysctl_init("vm", vm_table); - register_sysctl_init("debug", debug_table); return 0; } -- cgit v1.2.3 From d0c2d66fcc8db748edfe60e3b443eaff931f50e9 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh 
Date: Wed, 17 May 2023 14:53:23 +0000 Subject: ftrace: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu (Google) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230517145323.1522010-1-azeemshaikh38@gmail.com --- kernel/trace/ftrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..6a77edb51f18 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5743,7 +5743,7 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -5751,7 +5751,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -5763,14 +5763,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -6569,8 +6569,8 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, continue; *value = op->trampoline; *type = 't'; - strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); - strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); *exported = 0; return 0; } @@ -6933,7 +6933,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, if (off) *off = addr - found_func->ip; if (sym) - strlcpy(sym, found_func->name, KSYM_NAME_LEN); + strscpy(sym, found_func->name, KSYM_NAME_LEN); return found_func->name; } @@ -6987,8 +6987,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod_func->ip; *type = 'T'; - strlcpy(name, mod_func->name, KSYM_NAME_LEN); - strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + strscpy(name, mod_func->name, KSYM_NAME_LEN); + strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); *exported = 1; preempt_enable(); return 0; -- cgit v1.2.3 From ffadc372529e268b54c5b98f56da07d8024fa1cb Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 30 May 2023 15:56:59 +0000 Subject: bpf: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. 
This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. This is not the case here; however, in an effort to remove strlcpy() completely [2], let's replace strlcpy() here with strscpy(). No return values were used, so a direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Signed-off-by: Daniel Borkmann Reviewed-by: Kees Cook Link: https://lore.kernel.org/bpf/20230530155659.309657-1-azeemshaikh38@gmail.com --- kernel/bpf/preload/bpf_preload_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index b56f9f3314fd..0c63bc2cd895 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -23,9 +23,9 @@ static void free_links_and_skel(void) static int preload(struct bpf_preload_info *obj) { - strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); obj[0].link = maps_link; - strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); obj[1].link = progs_link; return 0; } -- cgit v1.2.3 From 76edc27eda068fb8222c452d522d4c93bcebe557 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 30 May 2023 16:35:46 +0000 Subject: clocksource: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Acked-by: John Stultz Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230530163546.986188-1-azeemshaikh38@gmail.com --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 91836b727cef..88cbc1181b23 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1480,7 +1480,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strlcpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str, sizeof(override_name)); mutex_unlock(&clocksource_mutex); return 1; } -- cgit v1.2.3 From 0f613bfa8268a89be25f2b6b58fc6fe8ccd9a2ba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 5 Jun 2023 08:01:15 +0100 Subject: locking/atomic: treewide: use raw_atomic*_<op>() Now that we have raw_atomic*_<op>() definitions, there's no need to use arch_atomic*_<op>() definitions outside of the low-level atomic definitions. Move treewide users of arch_atomic*_<op>() over to the equivalent raw_atomic*_<op>(). There should be no functional change as a result of this patch. 
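The conversion pattern, sketched on a hypothetical noinstr function (an editor's illustration, not code taken from the patch):

        /* Sketch: raw_atomic_*() are the uninstrumented counterparts of the
         * regular atomic_*() wrappers. They behave like the arch_atomic_*()
         * calls they replace here, but are the intended interface outside
         * of the low-level atomic implementation itself.
         */
        static noinstr void example_noinstr_update(atomic_t *v)
        {
                raw_atomic_set(v, 1);   /* was: arch_atomic_set(v, 1) */
                raw_atomic_add(2, v);   /* was: arch_atomic_add(2, v) */
        }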
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230605070124.3741859-19-mark.rutland@arm.com --- kernel/context_tracking.c | 12 ++++++------ kernel/sched/clock.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a09f1c19336a..6ef0b35fc28c 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - arch_atomic_set(&ct->state, state); + raw_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - arch_atomic_set(&ct->state, state); + raw_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - arch_atomic_add(state, &ct->state); + raw_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - arch_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CONTEXT_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - arch_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - arch_atomic_sub(state, &ct->state); + raw_atomic_sub(state, &ct->state); } } } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b5cc2b53464d..71443cff31f0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -287,7 +287,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; -- cgit v1.2.3 From 42cffe980ce383893660d78e33340763ca1dadae Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 30 May 2023 16:15:58 -0700 Subject: livepatch: Make 'klp_stack_entries' static The 'klp_stack_entries' percpu array is only used in transition.c. Make it static. 
Fixes: e92606fa172f ("livepatch: Convert stack entries array to percpu") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202305171329.i0UQ4TJa-lkp@intel.com/ Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/5115752fca6537720700f4bf5b178959dfbca41a.1685488550.git.jpoimboe@kernel.org --- kernel/livepatch/transition.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index e9fd83a02228..e54c3d60a904 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -15,7 +15,7 @@ #include "transition.h" #define MAX_STACK_ENTRIES 100 -DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); +static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); #define STACK_ERR_BUF_SIZE 128 -- cgit v1.2.3 From 0718afd47f70cf46877c39c25d06b786e1a3f36c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:52 +0200 Subject: block: introduce holder ops Add a new blk_holder_ops structure, which is passed to blkdev_get_by_* and installed in the block_device for exclusive claims. It will be used to allow the block layer to call back into the user of the block device for things like notification of a removed device or a device resize. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-10-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 92e41ed292ad..801c411530d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -357,7 +357,7 @@ static int swsusp_swap_check(void) root_swap = res; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL); + NULL, NULL); if (IS_ERR(hib_resume_bdev)) return PTR_ERR(hib_resume_bdev); @@ -1524,7 +1524,7 @@ int swsusp_check(void) mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &holder); + mode, &holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); -- cgit v1.2.3 From 02b42d58f3898134b900ff3030561099e38adb32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:13 +0200 Subject: PM: hibernate: factor out a helper to find the resume device Split the logic to find the resume device out of software_resume and into a separate helper to start unwinding the convoluted goto logic. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-3-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 72 +++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30d1274f03f6..072795063662 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -910,6 +910,41 @@ unlock: } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); +static int find_resume_device(void) +{ + if (!strlen(resume_file)) + return -ENOENT; + + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); + + if (resume_delay) { + pr_info("Waiting %dsec before reading resume device ...\n", + resume_delay); + ssleep(resume_delay); + } + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + if (swsusp_resume_device) + return 0; + + /* + * Some device discovery might still be in progress; we need to wait for + * this to finish. + */ + wait_for_device_probe(); + if (resume_wait) { + while (!(swsusp_resume_device = name_to_dev_t(resume_file))) + msleep(10); + async_synchronize_full(); + } + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) + return -ENODEV; + return 0; +} + /** * software_resume - Resume from a saved hibernation image. * @@ -949,45 +984,12 @@ static int software_resume(void) snapshot_test = false; - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } - - pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); - - if (resume_delay) { - pr_info("Waiting %dsec before reading resume device ...\n", - resume_delay); - ssleep(resume_delay); - } - - /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } - - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; + error = find_resume_device(); + if (error) goto Unlock; - } } - Check_image: pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); -- cgit v1.2.3 From d6545e687271ab27472eebff770f2de6a5f1a464 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:14 +0200 Subject: PM: hibernate: remove the global snapshot_test variable Passing call dependent variable in global variables is a huge antipattern. Fix it up. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-4-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 17 ++++++----------- kernel/power/power.h | 3 +-- kernel/power/swap.c | 2 +- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 072795063662..78696aa04f5c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -64,7 +64,6 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; -bool snapshot_test; static const struct platform_hibernation_ops *hibernation_ops; @@ -684,7 +683,7 @@ static void power_down(void) cpu_relax(); } -static int load_image_and_restore(void) +static int load_image_and_restore(bool snapshot_test) { int error; unsigned int flags; @@ -721,6 +720,7 @@ static int load_image_and_restore(void) */ int hibernate(void) { + bool snapshot_test = false; unsigned int sleep_flags; int error; @@ -748,9 +748,6 @@ int hibernate(void) if (error) goto Exit; - /* protected by system_transition_mutex */ - snapshot_test = false; - lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -792,9 +789,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(); + error = swsusp_check(snapshot_test); if (!error) - error = load_image_and_restore(); + error = load_image_and_restore(snapshot_test); } thaw_processes(); @@ -982,8 +979,6 @@ static int software_resume(void) */ mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - snapshot_test = false; - if (!swsusp_resume_device) { error = find_resume_device(); if (error) @@ -994,7 +989,7 @@ static int software_resume(void) MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); - error = swsusp_check(); + error = swsusp_check(false); if (error) goto Unlock; @@ -1022,7 +1017,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(); + error = load_image_and_restore(false); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..978189fcafd1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -59,7 +59,6 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; -extern bool snapshot_test; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); @@ -174,7 +173,7 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -extern int swsusp_check(void); +int swsusp_check(bool snapshot_test); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 801c411530d1..81aec3b2c605 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1514,7 +1514,7 @@ end: * swsusp_check - Check for swsusp signature in the resume device */ -int swsusp_check(void) +int swsusp_check(bool snapshot_test) { int error; void *holder; -- cgit v1.2.3 From cc89c63e2fe37d476357c82390dfb12edcd41cdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:15 +0200 Subject: PM: hibernate: move finding the resume device out of software_resume software_resume can be called either from an init call in the boot code, or from sysfs 
once the system has finished booting, and the two invocation methods can't race with each other. For the latter case we did just parse the suspend device manually, while the former might not have one. Split software_resume so that the search only happens for the boot case, which also means the special lockdep nesting annotation can go away as the system transition mutex can be taken a little later and doesn't have the sysfs locking nest inside it. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-5-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 80 +++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 78696aa04f5c..45e24b02cd50 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -907,7 +907,7 @@ unlock: } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); -static int find_resume_device(void) +static int __init find_resume_device(void) { if (!strlen(resume_file)) return -ENOENT; @@ -942,53 +942,16 @@ static int __init find_resume_device(void) return 0; } -/** - * software_resume - Resume from a saved hibernation image. - * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. - */ static int software_resume(void) { int error; - /* - * If the user said "noresume".. bail out early. - */ - if (noresume || !hibernation_available()) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take system_transition_mutex) - * this can cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - - if (!swsusp_resume_device) { - error = find_resume_device(); - if (error) - goto Unlock; - } - pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); + + mutex_lock(&system_transition_mutex); error = swsusp_check(false); if (error) goto Unlock; @@ -1035,7 +998,39 @@ static int software_resume(void) goto Finish; } -late_initcall_sync(software_resume); +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. 
+ * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { @@ -1176,6 +1171,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, char *name; dev_t res; + if (!hibernation_available()) + return 0; + if (len && buf[len-1] == '\n') len--; name = kstrndup(buf, len, GFP_KERNEL); -- cgit v1.2.3 From cf056a43121559d3642419917d405c3237ded90a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:24 +0200 Subject: init: improve the name_to_dev_t interface name_to_dev_t has a very misleading name that doesn't make clear it should only be used by the early init code, and also has a bad calling convention that doesn't allow returning different kinds of errors. Rename it to early_lookup_bdev to make the use case clear, and return an errno, where -EINVAL means the string could not be parsed, and -ENODEV means the string was valid, but there was no device found for it. Also stub out the whole call for !CONFIG_BLOCK as all the non-block root cases are always covered in the caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-14-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 45e24b02cd50..c52dedb9f7c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include #include #include #include @@ -921,8 +922,7 @@ static int __init find_resume_device(void) } /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (swsusp_resume_device) + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) return 0; /* * Some device discovery might still be in progress; we need to wait for * this to finish. */ wait_for_device_probe(); if (resume_wait) { - while (!(swsusp_resume_device = name_to_dev_t(resume_file))) + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) msleep(10); async_synchronize_full(); } - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) - return -ENODEV; - return 0; + return early_lookup_bdev(resume_file, &swsusp_resume_device); } static int software_resume(void) @@ -1169,7 +1166,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, unsigned int sleep_flags; int len = n; char *name; - dev_t res; + dev_t dev; + int error; if (!hibernation_available()) return 0; @@ -1180,13 +1178,13 @@ if (!name) return -ENOMEM; - res = name_to_dev_t(name); + error = early_lookup_bdev(name, &dev); kfree(name); - if (!res) - return -EINVAL; + if (error) return error; sleep_flags = lock_system_sleep(); - swsusp_resume_device = res; + swsusp_resume_device = dev; unlock_system_sleep(sleep_flags); 
pm_pr_dbg("Configured hibernation resume from disk to %u\n", -- cgit v1.2.3 From 1e8c813b083c4122dfeaa5c3b11028331026e85d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:32 +0200 Subject: PM: hibernate: don't use early_lookup_bdev in resume_store resume_store is a sysfs attribute written during normal kernel runtime, and it should not use the early_lookup_bdev API that bypasses all normal path based permission checking, and might cause problems with certain container environments renaming devices. Switch to lookup_bdev, which does a normal path lookup instead, and fall back to trying to parse a numeric dev_t just like early_lookup_bdev did. Note that this strictly speaking changes the kernel ABI as the PARTUUID= and PARTLABEL= style syntax is now not available during a running systems. They never were intended for that, but this breaks things we'll have to figure out a way to make them available again. But if avoidable in any way I'd rather avoid that. Fixes: 421a5fa1a6cf ("PM / hibernate: use name_to_dev_t to parse resume") Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-22-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c52dedb9f7c8..7ae95ec72f99 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1178,7 +1178,23 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!name) return -ENOMEM; - error = early_lookup_bdev(name, &dev); + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } kfree(name); if (error) return error; -- cgit v1.2.3 From 0dd37d6dd33a9c23351e6115ae8cdac7863bc7de Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 30 May 2023 16:25:07 +0800 Subject: sched/fair: Don't balance task to its current running CPU We've run into the case that the balancer tries to balance a migration disabled task and trigger the warning in set_task_cpu() like below: ------------[ cut here ]------------ WARNING: CPU: 7 PID: 0 at kernel/sched/core.c:3115 set_task_cpu+0x188/0x240 Modules linked in: hclgevf xt_CHECKSUM ipt_REJECT nf_reject_ipv4 <...snip> CPU: 7 PID: 0 Comm: swapper/7 Kdump: loaded Tainted: G O 6.1.0-rc4+ #1 Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 CS V5.B221.01 12/09/2021 pstate: 604000c9 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : set_task_cpu+0x188/0x240 lr : load_balance+0x5d0/0xc60 sp : ffff80000803bc70 x29: ffff80000803bc70 x28: ffff004089e190e8 x27: ffff004089e19040 x26: ffff007effcabc38 x25: 0000000000000000 x24: 0000000000000001 x23: ffff80000803be84 x22: 000000000000000c x21: ffffb093e79e2a78 x20: 000000000000000c x19: ffff004089e19040 x18: 0000000000000000 x17: 0000000000001fad x16: 0000000000000030 x15: 0000000000000000 x14: 0000000000000003 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000000400 x9 : ffffb093e4cee530 x8 : 00000000fffffffe x7 : 0000000000ce168a x6 : 000000000000013e x5 : 00000000ffffffe1 x4 : 0000000000000001 
x3 : 0000000000000b2a x2 : 0000000000000b2a x1 : ffffb093e6d6c510 x0 : 0000000000000001 Call trace: set_task_cpu+0x188/0x240 load_balance+0x5d0/0xc60 rebalance_domains+0x26c/0x380 _nohz_idle_balance.isra.0+0x1e0/0x370 run_rebalance_domains+0x6c/0x80 __do_softirq+0x128/0x3d8 ____do_softirq+0x18/0x24 call_on_irq_stack+0x2c/0x38 do_softirq_own_stack+0x24/0x3c __irq_exit_rcu+0xcc/0xf4 irq_exit_rcu+0x18/0x24 el1_interrupt+0x4c/0xe4 el1h_64_irq_handler+0x18/0x2c el1h_64_irq+0x74/0x78 arch_cpu_idle+0x18/0x4c default_idle_call+0x58/0x194 do_idle+0x244/0x2b0 cpu_startup_entry+0x30/0x3c secondary_start_kernel+0x14c/0x190 __secondary_switched+0xb0/0xb4 ---[ end trace 0000000000000000 ]--- Further investigation shows that the warning is superfluous: the migration disabled task is just going to be migrated to its current running CPU. This is because, on load balance, if the dst_cpu is not allowed by the task, we'll re-select a new_dst_cpu as a candidate. If no task can be balanced to dst_cpu we'll try to balance the task to the new_dst_cpu instead. In this case, when the migration disabled task is not on CPU it is only allowed to run on its current CPU, so load balance will select its current CPU as new_dst_cpu and later trigger the warning above. The new_dst_cpu is chosen from the env->dst_grpmask. Currently it contains CPUs in sched_group_span() and if we have overlapped groups it's possible to run into this case. This patch sets env->dst_grpmask to group_balance_mask(), which excludes any CPUs from the busiest group, and solves the issue. For balancing in a domain with no overlapped groups the behaviour stays the same as before. Suggested-by: Vincent Guittot Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230530082507.10444-1-yangyicong@huawei.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 48b6f0ca13ac..df0ff905a4aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10742,7 +10742,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, -- cgit v1.2.3 From d5e1586617be7093ea3419e3fa9387ed833cdbb1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Jun 2023 10:42:53 +0200 Subject: sched: Unconditionally use full-fat wait_task_inactive() While modifying wait_task_inactive() for PREEMPT_RT, the build robot noted that UP got broken. This led to an audit and consideration of the UP implementation of wait_task_inactive(). It looks like the UP implementation is also broken for PREEMPT; consider task_current_syscall() getting preempted between the two calls to wait_task_inactive(). Therefore move the wait_task_inactive() implementation out of CONFIG_SMP and unconditionally use it. 
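For context, the two-call usage pattern this breakage concerns, sketched by the editor from the kerneldoc of wait_task_inactive() (not code taken from the patch):

        /* Sketch: a task_current_syscall()-style caller. The switch count
         * returned by the first call is compared with a second call; if they
         * differ, @p ran in between and the sampled data must be discarded.
         * It is exactly this window between the two calls that a preemption
         * can fall into on UP+PREEMPT. TASK_ANY matches any task state.
         */
        unsigned long ncsw;

        do {
                ncsw = wait_task_inactive(p, TASK_ANY);  /* p is now off-CPU */
                /* ... sample p's registers or syscall state here ... */
        } while (wait_task_inactive(p, TASK_ANY) != ncsw);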
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230602103731.GA630648%40hirez.programming.kicks-ass.net --- kernel/sched/core.c | 216 ++++++++++++++++++++++++++-------------------------- 1 file changed, 108 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 944c3ae39861..810cf7dc98cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2213,6 +2213,114 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq); } +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) +{ + int running, queued; + struct rq_flags rf; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_on_cpu()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_on_cpu(rq, p)) { + if (!(READ_ONCE(p->__state) & match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &rf); + trace_sched_wait_task(p); + running = task_on_cpu(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if (READ_ONCE(p->__state) & match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &rf); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! 
+ */ + break; + } + + return ncsw; +} + #ifdef CONFIG_SMP static void @@ -3341,114 +3449,6 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * Wait for the thread to block in any of the states set in @match_state. - * If it changes, i.e. @p might have woken up, then return zero. When we - * succeed in waiting for @p to be off its CPU, we return a positive number - * (its total switch count). If a second call a short while later returns the - * same number, the caller can be sure that @p has remained unscheduled the - * whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -{ - int running, queued; - struct rq_flags rf; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_on_cpu()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_on_cpu(rq, p)) { - if (!(READ_ONCE(p->__state) & match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &rf); - trace_sched_wait_task(p); - running = task_on_cpu(rq, p); - queued = task_on_rq_queued(p); - ncsw = 0; - if (READ_ONCE(p->__state) & match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &rf); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(queued)) { - ktime_t to = NSEC_PER_SEC / HZ; - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! 
- */ - break; - } - - return ncsw; -} - /*** * kick_process - kick a running thread to enter/exit the kernel * @p: the to-be-kicked thread -- cgit v1.2.3 From 1c06918788e8ae6e69e4381a2806617312922524 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 16:39:07 +0200 Subject: sched: Consider task_struct::saved_state in wait_task_inactive() With the introduction of task_struct::saved_state in commit 5f220be21418 ("sched/wakeup: Prepare for RT sleeping spin/rwlocks") matching the task state has gotten more complicated. That same commit changed try_to_wake_up() to consider both states, but wait_task_inactive() has been neglected. Sebastian noted that the wait_task_inactive() usage in ptrace_check_attach() can misbehave when ptrace_stop() is blocked on the tasklist_lock after it sets TASK_TRACED. Therefore extract a common helper from ttwu_state_match() and use that to teach wait_task_inactive() about the PREEMPT_RT locks. Originally-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20230601091234.GW83892@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 810cf7dc98cf..ac38225e6d09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2213,6 +2213,39 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq); } +static __always_inline +int __task_state_match(struct task_struct *p, unsigned int state) +{ + if (READ_ONCE(p->__state) & state) + return 1; + +#ifdef CONFIG_PREEMPT_RT + if (READ_ONCE(p->saved_state) & state) + return -1; +#endif + return 0; +} + +static __always_inline +int task_state_match(struct task_struct *p, unsigned int state) +{ +#ifdef CONFIG_PREEMPT_RT + int match; + + /* + * Serialize against current_save_and_set_rtlock_wait_state() and + * current_restore_rtlock_saved_state(). + */ + raw_spin_lock_irq(&p->pi_lock); + match = __task_state_match(p, state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +#else + return __task_state_match(p, state); +#endif +} + /* * wait_task_inactive - wait for a thread to unschedule. * @@ -2231,7 +2264,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { - int running, queued; + int running, queued, match; struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -2257,7 +2290,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * is actually now running somewhere else! */ while (task_on_cpu(rq, p)) { - if (!(READ_ONCE(p->__state) & match_state)) + if (!task_state_match(p, match_state)) return 0; cpu_relax(); } @@ -2272,8 +2305,15 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (READ_ONCE(p->__state) & match_state) + if ((match = __task_state_match(p, match_state))) { + /* + * When matching on p->saved_state, consider this task + * still queued so it will wait. 
+ */ + if (match < 0) + queued = 1; ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + } task_rq_unlock(rq, p, &rf); /* @@ -4003,15 +4043,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) { + int match; + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && state != TASK_RTLOCK_WAIT); } - if (READ_ONCE(p->__state) & state) { - *success = 1; - return true; - } + *success = !!(match = __task_state_match(p, state)); #ifdef CONFIG_PREEMPT_RT /* @@ -4027,12 +4066,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * p::saved_state to TASK_RUNNING so any further tests will * not result in false positives vs. @success */ - if (p->saved_state & state) { + if (match < 0) p->saved_state = TASK_RUNNING; - *success = 1; - } #endif - return false; + return match > 0; } /* -- cgit v1.2.3 From d16317de9b412aa7bd3598c607112298e36b4352 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2023 12:20:59 +0200 Subject: seqlock/latch: Provide raw_read_seqcount_latch_retry() The read side of seqcount_latch consists of: do { seq = raw_read_seqcount_latch(&latch->seq); ... } while (read_seqcount_latch_retry(&latch->seq, seq)); which is asymmetric in the raw_ department, and sure enough, read_seqcount_latch_retry() includes (explicit) instrumentation where raw_read_seqcount_latch() does not. This inconsistency becomes a problem when trying to use it from noinstr code. As such, fix it by renaming and re-implementing raw_read_seqcount_latch_retry() without the instrumentation. Specifically the instrumentation in question is kcsan_atomic_next(0) in do___read_seqcount_retry(). Losing this annotation is not a problem because raw_read_seqcount_latch() does not pass through kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX). 
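With the rename the read side becomes symmetric in the raw_ department; a sketch of the resulting pattern, assembled by the editor from the snippet quoted above and the latched_seq reader in the printk hunk below:

        u64 val;
        unsigned int seq, idx;

        do {
                seq = raw_read_seqcount_latch(&latch->seq);
                idx = seq & 0x1;        /* pick the currently stable copy */
                val = latch->val[idx];
        } while (raw_read_seqcount_latch_retry(&latch->seq, seq));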
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Petr Mladek Tested-by: Michael Kelley # Hyper-V Link: https://lore.kernel.org/r/20230519102715.233598176@infradead.org --- kernel/printk/printk.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timekeeping.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6a333adce3b3..357a4d18f638 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls) seq = raw_read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; - } while (read_seqcount_latch_retry(&ls->latch, seq)); + } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); return val; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8464c5acc913..e8f2fb09a214 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -77,7 +77,7 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) notrace int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_latch_retry(&cd.seq, seq); + return raw_read_seqcount_latch_retry(&cd.seq, seq); } unsigned long long notrace sched_clock(void) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 09d594900ee0..266d02809dbb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) *mono = basem + delta; -- cgit v1.2.3 From 5949a68c73444d89b171703b67ff04fc4d6059c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2023 12:21:00 +0200 Subject: time/sched_clock: Provide sched_clock_noinstr() With the intent to provide local_clock_noinstr(), a variant of local_clock() that's safe to be called from noinstr code (with the assumption that any such code will already be non-preemptible), prepare for things by providing a noinstr sched_clock_noinstr() function. Specifically, preempt_enable_*() calls out to schedule(), which upsets noinstr validation efforts. As such, pull out the preempt_{dis,en}able_notrace() requirements from the sched_clock_read() implementations by explicitly providing it in the sched_clock() function. This further requires said sched_clock_read() functions to be noinstr themselves, for ARCH_WANTS_NO_INSTR users. See the next few patches. 
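The shape of the resulting split, paraphrased by the editor (the real change is in the kernel/time/sched_clock.c hunk below):

        /* Sketch: the noinstr variant assumes its caller is already
         * non-preemptible; the regular sched_clock() preserves the old
         * behaviour by supplying the preempt protection itself, keeping
         * the schedule() call-out away from noinstr code.
         */
        unsigned long long notrace sched_clock(void)
        {
                unsigned long long ns;

                preempt_disable_notrace();
                ns = sched_clock_noinstr();
                preempt_enable_notrace();
                return ns;
        }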
Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley # Hyper-V Link: https://lore.kernel.org/r/20230519102715.302350330@infradead.org --- kernel/time/sched_clock.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index e8f2fb09a214..68d6c1190ac7 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = { .actual_read_sched_clock = jiffy_sched_clock_read, }; -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } @@ -80,23 +80,33 @@ notrace int sched_clock_read_retry(unsigned int seq) return raw_read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long notrace sched_clock(void) +unsigned long long noinstr sched_clock_noinstr(void) { - u64 cyc, res; - unsigned int seq; struct clock_read_data *rd; + unsigned int seq; + u64 cyc, res; do { - rd = sched_clock_read_begin(&seq); + seq = raw_read_seqcount_latch(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (sched_clock_read_retry(seq)); + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); return res; } +unsigned long long notrace sched_clock(void) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = sched_clock_noinstr(); + preempt_enable_notrace(); + return ns; +} + /* * Updating the data required to read the clock. * -- cgit v1.2.3 From fb7d4948c4da2dbd26da4b7ec76bbd2f19ff862a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2023 12:21:10 +0200 Subject: sched/clock: Provide local_clock_noinstr() Now that all ARCH_WANTS_NO_INSTR architectures (arm64, loongarch, s390, x86) provide sched_clock_noinstr(), use this to provide local_clock_noinstr(). This local_clock_noinstr() will be safe to use from noinstr code with the assumption that any such noinstr code is non-preemptible (it had better be, entry code will have IRQs disabled while __cpuidle must have preemption disabled). Specifically, preempt_enable_notrace(), a common part of many a sched_clock() implementation calls out to schedule() -- even though, per the above, it will never trigger -- which frustrates noinstr validation. 
vmlinux.o: warning: objtool: local_clock+0xb5: call to preempt_schedule_notrace_thunk() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Tested-by: Michael Kelley # Hyper-V Link: https://lore.kernel.org/r/20230519102715.978624636@infradead.org --- kernel/sched/clock.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b5cc2b53464d..5a575a0ba4e6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) s64 delta; again: - now = sched_clock(); + now = sched_clock_noinstr(); delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; @@ -293,22 +293,29 @@ again: return clock; } -noinstr u64 local_clock(void) +noinstr u64 local_clock_noinstr(void) { u64 clock; if (static_branch_likely(&__sched_clock_stable)) - return sched_clock() + __sched_clock_offset; + return sched_clock_noinstr() + __sched_clock_offset; if (!static_branch_likely(&sched_clock_running)) - return sched_clock(); + return sched_clock_noinstr(); - preempt_disable_notrace(); clock = sched_clock_local(this_scd()); - preempt_enable_notrace(); return clock; } + +u64 local_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = local_clock_noinstr(); + preempt_enable_notrace(); + return now; +} EXPORT_SYMBOL_GPL(local_clock); static notrace u64 sched_clock_remote(struct sched_clock_data *scd) -- cgit v1.2.3 From 3eb6d6ececca2fd566d717b37ab467c246f66be7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 15 May 2023 13:57:34 +0200 Subject: sched/fair: Refactor CPU utilization functions There is a lot of code duplication in cpu_util_next() & cpu_util_cfs(). Remove this by allowing cpu_util_next() to be called with p = NULL. Rename cpu_util_next() to cpu_util() since the '_next' suffix is no longer necessary to distinguish cpu utilization related functions. Implement cpu_util_cfs(cpu) as cpu_util(cpu, p = NULL, -1). This will allow future related cpu util changes to be coded in only one place, namely in cpu_util(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230515115735.296329-2-dietmar.eggemann@arm.com --- kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 47 +--------------------------------------- 2 files changed, 50 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df0ff905a4aa..09e3be2e0464 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7202,11 +7202,41 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu - * (@dst_cpu = -1) or migrated to @dst_cpu. - */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +/** + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for + * @p: task for which the CPU utilization should be predicted or NULL + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. 
+ * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Estimated) utilization for the specified CPU. + */ +static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); @@ -7217,9 +7247,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * contribution. In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct. */ - if (task_cpu(p) == cpu && dst_cpu != cpu) + if (p && task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { @@ -7255,7 +7285,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) */ if (dst_cpu == cpu) util_est += _task_util_est(p); - else if (unlikely(task_on_rq_queued(p) || current == p)) + else if (p && unlikely(task_on_rq_queued(p) || current == p)) lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); @@ -7264,6 +7294,11 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +unsigned long cpu_util_cfs(int cpu) +{ + return cpu_util(cpu, NULL, -1); +} + /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -7281,9 +7316,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); + p = NULL; - return cpu_util_next(cpu, p, -1); + return cpu_util(cpu, p, -1); } /* @@ -7330,7 +7365,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv, * cpu_capacity. * * The contribution of the task @p for which we want to estimate the - * energy cost is removed (by cpu_util_next()) and must be calculated + * energy cost is removed (by cpu_util()) and must be calculated * separately (see eenv_task_busy_time). 
This ensures: * * - A stable PD utilization, no matter which CPU of that PD we want to place @@ -7351,7 +7386,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, int cpu; for_each_cpu(cpu, pd_cpus) { - unsigned long util = cpu_util_next(cpu, p, -1); + unsigned long util = cpu_util(cpu, p, -1); busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } @@ -7375,7 +7410,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; - unsigned long util = cpu_util_next(cpu, p, dst_cpu); + unsigned long util = cpu_util(cpu, p, dst_cpu); unsigned long cpu_util; /* @@ -7521,7 +7556,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util_next(cpu, p, cpu); + util = cpu_util(cpu, p, cpu); cpu_cap = capacity_of(cpu); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d8ba81c66579..aaf6fc2df6ff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2955,53 +2955,8 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -/** - * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. - * @cpu: the CPU to get the utilization for. - * - * The unit of the return value must be the same as the one of CPU capacity - * so that CPU utilization can be compared with CPU capacity. - * - * CPU utilization is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on that CPU. - * It represents the amount of CPU capacity currently used by CFS tasks in - * the range [0..max CPU capacity] with max CPU capacity being the CPU - * capacity at f_max. - * - * The estimated CPU utilization is defined as the maximum between CPU - * utilization and sum of the estimated utilization of the currently - * runnable tasks on that CPU. It preserves a utilization "snapshot" of - * previously-executed tasks, which helps better deduce how busy a CPU will - * be when a long-sleeping task wakes up. The contribution to CPU utilization - * of such a task would be significantly decayed at this point of time. - * - * CPU utilization can be higher than the current CPU capacity - * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because - * of rounding errors as well as task migrations or wakeups of new tasks. - * CPU utilization has to be capped to fit into the [0..max CPU capacity] - * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) - * could be seen as over-utilized even though CPU1 has 20% of spare CPU - * capacity. CPU utilization is allowed to overshoot current CPU capacity - * though since this is useful for predicting the CPU capacity required - * after task migrations (scheduler-driven DVFS). - * - * Return: (Estimated) utilization for the specified CPU. 
- */ -static inline unsigned long cpu_util_cfs(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned long util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(cfs_rq->avg.util_est.enqueued)); - } - - return min(util, capacity_orig_of(cpu)); -} +extern unsigned long cpu_util_cfs(int cpu); static inline unsigned long cpu_util_rt(struct rq *rq) { -- cgit v1.2.3 From 7d0583cf9ec7bf8e5897dc7d3a7059e8fae5464a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 15 May 2023 13:57:35 +0200 Subject: sched/fair, cpufreq: Introduce 'runnable boosting' The responsiveness of the Per Entity Load Tracking (PELT) util_avg in mobile devices is still considered too low for utilization changes during task ramp-up. In Android this manifests in the fact that the first frames of a UI activity are very prone to be jankframes (a frame which doesn't meet the required frame rendering time, e.g. 16ms@60Hz) since the CPU frequency is normally low at this point and has to ramp up quickly. The beginning of a UI activity is also characterized by the occurrence of CPU contention, especially on little CPUs. Current little CPUs can have an original CPU capacity of only ~ 150 which means that the actual CPU capacity at lower frequency can even be much smaller. Schedutil maps CPU util_avg into a CPU frequency request via: util = effective_cpu_util(..., cpu_util_cfs(cpu), ...) -> util = map_util_perf(util) -> freq = map_util_freq(util, ...) CPU contention for CFS tasks can be detected by 'CPU runnable > CPU utilization' in cpu_util_cfs_boost() -> cpu_util(..., boost = 1). Schedutil uses 'runnable boosting' by calling cpu_util_cfs_boost(). To be in sync with schedutil's CPU frequency selection, Energy Aware Scheduling (EAS) also calls cpu_util(..., boost = 1) during max util detection. Moreover, 'runnable boosting' is also used in load-balance for busiest CPU selection when the migration type is 'migrate_util', i.e. only at sched domains which don't have the SD_SHARE_PKG_RESOURCES flag set.
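In short, the boosted value is just max(CPU runnable, CPU utilization). A stand-alone sketch of that rule, in plain C with illustrative names rather than code taken from the patch:

	/*
	 * Illustrative only: util_avg and runnable_avg stand in for the PELT
	 * signals cfs_rq->avg.util_avg and cfs_rq->avg.runnable_avg.
	 */
	static unsigned long boosted_util(unsigned long util_avg,
					  unsigned long runnable_avg)
	{
		/* CPU contention: runnable exceeds utilization */
		return util_avg > runnable_avg ? util_avg : runnable_avg;
	}

Since runnable_avg also accounts for tasks waiting on the runqueue, the max only differs from plain utilization on a contended CPU, which is exactly the ramp-up situation described above.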
Suggested-by: Vincent Guittot Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20230515115735.296329-3-dietmar.eggemann@arm.com --- kernel/sched/cpufreq_schedutil.c | 3 ++- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 1 + 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..4492608b7d7f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { + unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); struct rq *rq = cpu_rq(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 09e3be2e0464..6189d1a45635 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7207,6 +7207,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * @cpu: the CPU to get the utilization for * @p: task for which the CPU utilization should be predicted or NULL * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * @boost: 1 to enable boosting, otherwise 0 * * The unit of the return value must be the same as the one of CPU capacity * so that CPU utilization can be compared with CPU capacity. @@ -7224,6 +7225,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * be when a long-sleeping task wakes up. The contribution to CPU utilization * of such a task would be significantly decayed at this point of time. * + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). + * CPU contention for CFS tasks can be detected by CPU runnable > CPU + * utilization. Boosting is implemented in cpu_util() so that internal + * users (e.g. EAS) can use it next to external users (e.g. schedutil), + * latter via cpu_util_cfs_boost(). + * * CPU utilization can be higher than the current CPU capacity * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because * of rounding errors as well as task migrations or wakeups of new tasks. @@ -7234,12 +7241,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * though since this is useful for predicting the CPU capacity required * after task migrations (scheduler-driven DVFS). * - * Return: (Estimated) utilization for the specified CPU. + * Return: (Boosted) (estimated) utilization for the specified CPU. 
*/ -static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu) +static unsigned long +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long runnable; + + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + util = max(util, runnable); + } /* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its @@ -7257,6 +7271,9 @@ static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu) util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + if (boost) + util_est = max(util_est, runnable); + /* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. @@ -7296,7 +7313,12 @@ static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu) unsigned long cpu_util_cfs(int cpu) { - return cpu_util(cpu, NULL, -1); + return cpu_util(cpu, NULL, -1, 0); +} + +unsigned long cpu_util_cfs_boost(int cpu) +{ + return cpu_util(cpu, NULL, -1, 1); } /* @@ -7318,7 +7340,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) p = NULL; - return cpu_util(cpu, p, -1); + return cpu_util(cpu, p, -1, 0); } /* @@ -7386,7 +7408,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, int cpu; for_each_cpu(cpu, pd_cpus) { - unsigned long util = cpu_util(cpu, p, -1); + unsigned long util = cpu_util(cpu, p, -1, 0); busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } @@ -7410,7 +7432,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; - unsigned long util = cpu_util(cpu, p, dst_cpu); + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); unsigned long cpu_util; /* @@ -7556,7 +7578,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util(cpu, p, cpu); + util = cpu_util(cpu, p, cpu, 0); cpu_cap = capacity_of(cpu); /* @@ -10607,7 +10629,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util_cfs(i); + util = cpu_util_cfs_boost(i); /* * Don't try to pull utilization from a CPU with one diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aaf6fc2df6ff..556496c77dc2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2957,6 +2957,7 @@ static inline unsigned long cpu_util_dl(struct rq *rq) extern unsigned long cpu_util_cfs(int cpu); +extern unsigned long cpu_util_cfs_boost(int cpu); static inline unsigned long cpu_util_rt(struct rq *rq) { -- cgit v1.2.3 From a49a11dc6449a7e1370441bc23d83d528787066d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 27 May 2023 17:33:53 +0800 Subject: cgroup: remove unused macro for_each_e_css() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for_each_e_css() is unused now. Remove it. 
Signed-off-by: Miaohe Lin Reviewed-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f329f49529a2..eb3d9c1f25e3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -690,21 +690,6 @@ EXPORT_SYMBOL_GPL(of_css); lockdep_is_held(&cgroup_mutex)))) { } \ else -/** - * for_each_e_css - iterate all effective css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. - */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ - else - /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor -- cgit v1.2.3 From 7bf11e90a30afcf79ff1acf28b01c6455799861a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 2 Jun 2023 15:43:46 +0800 Subject: cgroup: Replace the css_get call with cgroup_get We will release the refcnt of cgroup via cgroup_put, for example in the cgroup_lock_and_drain_offline function, so replace css_get with the cgroup_get function for better readability. Signed-off-by: Gaosheng Cui Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index eb3d9c1f25e3..f69e9a443ecc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -619,7 +619,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); + cgroup_get(cgrp); } /** -- cgit v1.2.3 From c20d4d889fc65bedd52430710ff7428bf32944fd Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 2 Jun 2023 15:44:56 +0800 Subject: rdmacg: fix kernel-doc warnings in rdmacg Fix all kernel-doc warnings in rdmacg: kernel/cgroup/rdma.c:209: warning: Function parameter or member 'cg' not described in 'rdmacg_uncharge_hierarchy' kernel/cgroup/rdma.c:230: warning: Function parameter or member 'cg' not described in 'rdmacg_uncharge' Signed-off-by: Gaosheng Cui Signed-off-by: Tejun Heo --- kernel/cgroup/rdma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 3135406608c7..ef5878fb2005 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg, /** * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup * stop uncharging @@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, /** * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @index: index of the resource to uncharge in cgroup in given resource pool */ -- cgit v1.2.3 From 2140a6e3422de22e6ebe77d4d18b6c0c9c425426 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 1 Jun 2023 19:26:40 -0700 Subject: bpf: Set kptr_struct_meta for node param to list and rbtree insert funcs In
verifier.c, fixup_kfunc_call uses struct bpf_insn_aux_data's kptr_struct_meta field to pass information about local kptr types to various helpers and kfuncs at runtime. The recent bpf_refcount series added a few functions to the set that need this information: * bpf_refcount_acquire * Needs to know where the refcount field is in order to increment * Graph collection insert kfuncs: bpf_rbtree_add, bpf_list_push_{front,back} * Were migrated to possibly fail by the bpf_refcount series. If insert fails, the input node is bpf_obj_drop'd. bpf_obj_drop needs the kptr_struct_meta in order to decr refcount and properly free special fields. Unfortunately the verifier handling of collection insert kfuncs was not modified to actually populate kptr_struct_meta. Accordingly, when the node input to those kfuncs is passed to bpf_obj_drop, it is done so without the information necessary to decr refcount. This patch fixes the issue by populating kptr_struct_meta for those kfuncs. Fixes: d2dcc67df910 ("bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail") Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230602022647.1571784-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 086b2a14905b..34e56af5b0bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10475,6 +10475,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; if (node_off != field->graph_root.node_offset) { verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", @@ -11044,6 +11046,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", -- cgit v1.2.3 From cc0d76cafebbd3e1ffab9c4252d48ecc9e0737f6 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 1 Jun 2023 19:26:41 -0700 Subject: bpf: Fix __bpf_{list,rbtree}_add's beginning-of-node calculation Given the pointer to struct bpf_{rb,list}_node within a local kptr and the byte offset of that field within the kptr struct, the calculation changed by this patch is meant to find the beginning of the kptr so that it can be passed to bpf_obj_drop. Unfortunately instead of doing ptr_to_kptr = ptr_to_node_field - offset_bytes the calculation is erroneously doing ptr_to_kptr = ptr_to_node_field - (offset_bytes * sizeof(struct bpf_rb_node)) or the bpf_list_node equivalent. This patch fixes the calculation.
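The mistake is the usual C pointer-arithmetic scaling rule; a minimal stand-alone illustration (node_stub is a hypothetical stand-in type, not a name from the patch):

	struct node_stub { unsigned long w[3]; };	/* stand-in for struct bpf_rb_node */

	void *wrong(struct node_stub *n, unsigned long off)
	{
		return (void *)(n - off);	/* scaled: subtracts off * sizeof(*n) bytes */
	}

	void *right(struct node_stub *n, unsigned long off)
	{
		return (void *)n - off;		/* GNU C void * arithmetic: subtracts off bytes */
	}

The kernel relies on the GNU C extension that gives void * arithmetic a unit of one byte, hence the (void *) cast before the subtraction in the hunks below.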
Fixes: d2dcc67df910 ("bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail") Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230602022647.1571784-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4ef4c4f8a355..a4e437eabcb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1950,7 +1950,7 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head INIT_LIST_HEAD(h); if (!list_empty(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } @@ -2032,7 +2032,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, if (!RB_EMPTY_NODE(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } -- cgit v1.2.3 From 7793fc3babe9fea908e57f7c187ea819f9fd7e95 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 1 Jun 2023 19:26:42 -0700 Subject: bpf: Make bpf_refcount_acquire fallible for non-owning refs This patch fixes an incorrect assumption made in the original bpf_refcount series [0], specifically that the BPF program calling bpf_refcount_acquire on some node can always guarantee that the node is alive. In that series, the patch adding failure behavior to rbtree_add and list_push_{front, back} breaks this assumption for non-owning references. Consider the following program: n = bpf_kptr_xchg(&mapval, NULL); /* skip error checking */ bpf_spin_lock(&l); if(bpf_rbtree_add(&t, &n->rb, less)) { bpf_refcount_acquire(n); /* Failed to add, do something else with the node */ } bpf_spin_unlock(&l); It's incorrect to assume that bpf_refcount_acquire will always succeed in this scenario. bpf_refcount_acquire is being called in a critical section here, but the lock being held is associated with rbtree t, which isn't necessarily the lock associated with the tree that the node is already in. So after bpf_rbtree_add fails to add the node and calls bpf_obj_drop in it, the program has no ownership of the node's lifetime. Therefore the node's refcount can be decr'd to 0 at any time after the failing rbtree_add. If this happens before the refcount_acquire above, the node might be free'd, and regardless refcount_acquire will be incrementing a 0 refcount. Later patches in the series exercise this scenario, resulting in the expected complaint from the kernel (without this patch's changes): refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 1 PID: 207 at lib/refcount.c:25 refcount_warn_saturate+0xbc/0x110 Modules linked in: bpf_testmod(O) CPU: 1 PID: 207 Comm: test_progs Tainted: G O 6.3.0-rc7-02231-g723de1a718a2-dirty #371 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0xbc/0x110 Code: 6f 64 f6 02 01 e8 84 a3 5c ff 0f 0b eb 9d 80 3d 5e 64 f6 02 00 75 94 48 c7 c7 e0 13 d2 82 c6 05 4e 64 f6 02 01 e8 64 a3 5c ff <0f> 0b e9 7a ff ff ff 80 3d 38 64 f6 02 00 0f 85 6d ff ff ff 48 c7 RSP: 0018:ffff88810b9179b0 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 RDX: 0000000000000202 RSI: 0000000000000008 RDI: ffffffff857c3680 RBP: ffff88810027d3c0 R08: ffffffff8125f2a4 R09: ffff88810b9176e7 R10: ffffed1021722edc R11: 746e756f63666572 R12: ffff88810027d388 R13: ffff88810027d3c0 R14: ffffc900005fe030 R15: ffffc900005fe048 FS: 00007fee0584a700(0000) GS:ffff88811b280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005634a96f6c58 CR3: 0000000108ce9002 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: bpf_refcount_acquire_impl+0xb5/0xc0 (rest of output snipped) The patch addresses this by changing bpf_refcount_acquire_impl to use refcount_inc_not_zero instead of refcount_inc and marking bpf_refcount_acquire KF_RET_NULL. For owning references, though, we know the above scenario is not possible and thus that bpf_refcount_acquire will always succeed. Some verifier bookkeeping is added to track "is input owning ref?" for bpf_refcount_acquire calls and return false from is_kfunc_ret_null for bpf_refcount_acquire on owning refs despite it being marked KF_RET_NULL. Existing selftests using bpf_refcount_acquire are modified where necessary to NULL-check its return value. 
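The new acquire path can be sketched in isolation with C11 atomics (an approximation only; the kernel's refcount_t additionally saturates and warns instead of wrapping):

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Take a reference only if the object is still alive. */
	static bool inc_not_zero(atomic_int *ref)
	{
		int old = atomic_load(ref);

		do {
			if (old == 0)
				return false;	/* refcount already hit zero: object may be freed */
		} while (!atomic_compare_exchange_weak(ref, &old, old + 1));

		return true;
	}

This is the same check-then-increment loop that refcount_inc_not_zero() performs, which is why the failure mode becomes a NULL return instead of a potential use-after-free.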
[0]: https://lore.kernel.org/bpf/20230415201811.343116-1-davemarchevsky@fb.com/ Fixes: d2dcc67df910 ("bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail") Reported-by: Kumar Kartikeya Dwivedi Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230602022647.1571784-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 8 ++++++-- kernel/bpf/verifier.c | 26 ++++++++++++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a4e437eabcb4..9e80efa59a5d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1933,8 +1933,12 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); + if (!refcount_inc_not_zero((refcount_t *)ref)) + return NULL; - refcount_inc((refcount_t *)ref); + /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null + * in verifier.c + */ return (void *)p__refcounted_kptr; } @@ -2406,7 +2410,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34e56af5b0bc..27b54266b4c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -298,16 +298,19 @@ struct bpf_kfunc_call_arg_meta { bool found; } arg_constant; - /* arg_btf and arg_btf_id are used by kfunc-specific handling, + /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic * bpf_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) - * Record the local kptr type to be refcount_incr'd + * Record the local kptr type to be refcount_incr'd and use + * arg_owning_ref to determine whether refcount_acquire should be + * fallible */ struct btf *arg_btf; u32 arg_btf_id; + bool arg_owning_ref; struct { struct btf_field *field; @@ -9678,11 +9681,6 @@ static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ACQUIRE; } -static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_RET_NULL; -} - static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RELEASE; @@ -9998,6 +9996,16 @@ BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && + meta->arg_owning_ref) { + return false; + } + + return meta->kfunc_flags & KF_RET_NULL; +} + static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; @@ -10880,10 +10888,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->subprogno = reg->subprogno; break; case 
KF_ARG_PTR_TO_REFCOUNTED_KPTR: - if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) { + if (!type_is_ptr_alloc_obj(reg->type)) { verbose(env, "arg#%d is neither owning or non-owning ref\n", i); return -EINVAL; } + if (!type_is_non_owning_ref(reg->type)) + meta->arg_owning_ref = true; rec = reg_btf_record(reg); if (!rec) { -- cgit v1.2.3 From 0dad9b072b2b170a99fcefa330a1a3193d503d8c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 3 Jun 2023 15:13:04 +0800 Subject: cgroup: make cgroup_is_threaded() and cgroup_is_thread_root() static They're only called inside cgroup.c. Make them static. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 -- kernel/cgroup/cgroup.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 367b0a42ada9..c56071f150f2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -220,8 +220,6 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); -bool cgroup_is_thread_root(struct cgroup *cgrp); -bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f69e9a443ecc..57f31b234433 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -357,7 +357,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } -bool cgroup_is_threaded(struct cgroup *cgrp) +static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } @@ -396,7 +396,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) } /* is @cgrp root of a threaded subtree? */ -bool cgroup_is_thread_root(struct cgroup *cgrp) +static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) -- cgit v1.2.3 From 503e4def5414fd0f9b6ffecb6eedbc4b1603693b Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sat, 27 May 2023 21:27:06 +0900 Subject: bpf: Replace open code with helpers for allocated object check Since commit 282de143ead9 ("bpf: Introduce allocated objects support"), (PTR_TO_BTF_ID | MEM_ALLOC) has been the way of indicating that a type is an allocated object. Since commit d8939cb0a03c ("bpf: Loosen alloc obj test in verifier's reg_btf_record"), there has been a helper function for checking this, named type_is_ptr_alloc_obj(). But still, some of the code uses open code to retrieve this info. This commit replaces the open code with type_is_alloc() and the type_is_ptr_alloc_obj() function. Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230527122706.59315-1-danieltimlee@gmail.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 27b54266b4c7..7acbd103f9ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5894,7 +5894,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * program allocated objects (which always have ref_obj_id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
*/ - if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { verbose(env, "only read is supported\n"); return -EACCES; } @@ -7514,7 +7514,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) type &= ~MEM_ALLOC; for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { -- cgit v1.2.3 From 51302c951c8fd5c298565c7127c855bf1d4550b6 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 2 Jun 2023 10:01:11 -0500 Subject: bpf: Teach verifier that trusted PTR_TO_BTF_ID pointers are non-NULL In reg_type_not_null(), we currently assume that a pointer may be NULL if it has the PTR_MAYBE_NULL modifier, or if it doesn't belong to one of several base type of pointers that are never NULL-able. For example, PTR_TO_CTX, PTR_TO_MAP_VALUE, etc. It turns out that in some cases, PTR_TO_BTF_ID can never be NULL as well, though we currently don't specify it. For example, if you had the following program: SEC("tc") long example_refcnt_fail(void *ctx) { struct bpf_cpumask *mask1, *mask2; mask1 = bpf_cpumask_create(); mask2 = bpf_cpumask_create(); if (!mask1 || !mask2) goto error_release; bpf_cpumask_test_cpu(0, (const struct cpumask *)mask1); bpf_cpumask_test_cpu(0, (const struct cpumask *)mask2); error_release: if (mask1) bpf_cpumask_release(mask1); if (mask2) bpf_cpumask_release(mask2); return ret; } The verifier will incorrectly fail to load the program, thinking (unintuitively) that we have a possibly-unreleased reference if the mask is NULL, because we (correctly) don't issue a bpf_cpumask_release() on the NULL path. The reason the verifier gets confused is due to the fact that we don't explicitly tell the verifier that trusted PTR_TO_BTF_ID pointers can never be NULL. Basically, if we successfully get past the if check (meaning both pointers go from ptr_or_null_bpf_cpumask to ptr_bpf_cpumask), the verifier will correctly assume that the references need to be dropped on any possible branch that leads to program exit. However, it will _incorrectly_ think that the ptr == NULL branch is possible, and will erroneously detect it as a branch on which we failed to drop the reference. The solution is of course to teach the verifier that trusted PTR_TO_BTF_ID pointers can never be NULL, so that it doesn't incorrectly think it's possible for the reference to be present on the ptr == NULL branch. A follow-on patch will add a selftest that verifies this behavior. 
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230602150112.1494194-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7acbd103f9ac..1e38584d497c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -197,6 +197,7 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static void specialize_kfunc(struct bpf_verifier_env *env, u32 func_id, u16 offset, unsigned long *addr); +static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -442,8 +443,11 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -static bool reg_type_not_null(enum bpf_reg_type type) +static bool reg_not_null(const struct bpf_reg_state *reg) { + enum bpf_reg_type type; + + type = reg->type; if (type_may_be_null(type)) return false; @@ -453,6 +457,7 @@ type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || + (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || type == PTR_TO_MEM; } @@ -13170,7 +13175,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { if (__is_pointer_value(false, reg)) { - if (!reg_type_not_null(reg->type)) + if (!reg_not_null(reg)) return -1; /* If pointer is valid tests against zero will fail so we can -- cgit v1.2.3 From 228020b490eda9133c9cb6f59a5ee1278d8c463f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jun 2023 12:14:01 +0200 Subject: perf: Re-instate the linear PMU search Full revert of commit 9551fbb64d09 ("perf/core: Remove pmu linear searching code"). Some architectures (notably arm/arm64) still relied on the linear search in order to find the PMU that consumes PERF_TYPE_{HARDWARE,HW_CACHE,RAW}. This will need a more thorough audit and cleanup.
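The restored flow is "idr fast path first, linear walk as fallback"; a simplified sketch with invented names (the kernel version in the diff below additionally distinguishes -ENOENT from hard errors and walks the list under pmus_srcu):

	#include <stddef.h>

	struct pmu_stub;	/* opaque stand-in for struct pmu */

	static struct pmu_stub *find_pmu(struct pmu_stub *idr_hit,
					 struct pmu_stub **all, int nr,
					 int (*try_init)(struct pmu_stub *))
	{
		int i;

		if (idr_hit && try_init(idr_hit) == 0)
			return idr_hit;		/* fast path: PMU registered with a fixed type */

		for (i = 0; i < nr; i++)	/* slow path: linear search */
			if (try_init(all[i]) == 0)
				return all[i];

		return NULL;			/* maps to -ENOENT in the kernel */
	}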
Reported-by: Nathan Chancellor Reported-by: Krzysztof Kozlowski Acked-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230605101401.GL38236@hirez.programming.kicks-ass.net --- kernel/events/core.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 231b187c1a3f..c01bbe93e291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11630,27 +11630,38 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - ret = -ENOENT; rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); - if (!pmu) - goto fail; + if (pmu) { + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + goto fail; - if (event->attr.type != type && type != PERF_TYPE_RAW && - !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type && !extended_type) { + type = event->attr.type; + goto again; + } - ret = perf_try_init_event(pmu, event); - if (ret == -ENOENT && event->attr.type != type && !extended_type) { - type = event->attr.type; - goto again; + if (ret) + pmu = ERR_PTR(ret); + + goto unlock; } -fail: - if (ret) - pmu = ERR_PTR(ret); + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; + } + } +fail: + pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); -- cgit v1.2.3 From 943211c87427f25bd22e0e63849fb486bb5f87fa Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Mon, 5 Jun 2023 20:06:16 +0530 Subject: watch_queue: prevent dangling pipe pointer NULL the dangling pipe reference while clearing watch_queue. If not done, a reference to a freed pipe remains in the watch_queue, as this function is called before freeing a pipe in free_pipe_info() (see line 834 of fs/pipe.c). The sole use of wqueue->defunct is for checking if the watch queue has been cleared, but wqueue->pipe is also NULLed while clearing. Thus, wqueue->defunct is superfluous, as wqueue->pipe can be checked for NULL. Hence, the former can be removed. Tested with keyutils testsuite. Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Siddh Raman Pant Acked-by: David Howells Message-Id: <20230605143616.640517-1-code@siddh.me> Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index e91cb4c2833f..d0b6b390ee42 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc."); static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); - if (unlikely(wqueue->defunct)) { + if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } @@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue, unsigned int head, tail, mask, note, offset, len; bool done = false; - if (!pipe) - return false; - spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; @@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new notifications from being stored. 
*/ - wqueue->defunct = true; + /* + * This pipe can be freed by callers like free_pipe_info(). + * Removing this reference also prevents new notifications. + */ + wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); -- cgit v1.2.3 From cb16330d12741f6dae56aad5acf62f5be3a06c4e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: fprobe: Pass return address to the handlers Pass return address as 'ret_ip' to the fprobe entry and return handlers so that the fprobe user handler can get the return address without analyzing arch-dependent pt_regs. Link: https://lore.kernel.org/all/168507467664.913472.11642316698862778600.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 1 + kernel/trace/bpf_trace.c | 6 ++++-- kernel/trace/fprobe.c | 6 +++--- kernel/trace/rethook.c | 3 ++- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 00e177de91cc..ce13f1a35251 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2127,6 +2127,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) NOKPROBE_SYMBOL(pre_handler_kretprobe); static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + unsigned long ret_addr, struct pt_regs *regs) { struct kretprobe *rp = (struct kretprobe *)data; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9a050e36dc6c..987c76d94604 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2642,7 +2642,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; @@ -2653,7 +2654,8 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 18d36842faf5..32994815edf6 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -46,7 +46,7 @@ static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, } if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); /* If entry_handler returns !0, nmissed is not counted. */ if (rh) { @@ -112,7 +112,7 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, } static void fprobe_exit_handler(struct rethook_node *rh, void *data, - struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs) { struct fprobe *fp = (struct fprobe *)data; struct fprobe_rethook_node *fpr; @@ -133,7 +133,7 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, return; } - fp->exit_handler(fp, fpr->entry_ip, regs, + fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, fp->entry_data_size ?
(void *)fpr->data : NULL); ftrace_test_recursion_unlock(bit); } diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 60f6cb2b486b..f32ee484391a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -301,7 +301,8 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, break; handler = READ_ONCE(rhn->rethook->handler); if (handler) - handler(rhn, rhn->rethook->data, regs); + handler(rhn, rhn->rethook->data, + correct_ret_addr, regs); if (first == node) break; -- cgit v1.2.3 From 30460c21ed40a10bf541c4e93ba5e80bb4aac5da Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: tracing/probes: Avoid setting TPARG_FL_FENTRY and TPARG_FL_RETURN When parsing a kprobe event, the return probe always sets both TPARG_FL_RETURN and TPARG_FL_FENTRY, but this is not useful because some fetchargs are only for return probe and some others only for function entry. Make it obviously mutual exclusive. Link: https://lore.kernel.org/all/168507468731.913472.11354553441385410734.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..867ffb7ee31d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -825,7 +825,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (is_return) flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); - if (ret == 0) + if (ret == 0 && !is_return) flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4ecfb1da43..5df59714f9f5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -357,6 +357,11 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) +/* + * The flags used for parsing trace_probe arguments. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TPOINT are mutually exclusive. + * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -- cgit v1.2.3 From 334e5519c3757019cc591d4539d5aca199bdb114 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:55 +0900 Subject: tracing/probes: Add fprobe events for tracing function entry and exit. Add fprobe events for tracing function entry and exit instead of kprobe events. With this change, we can continue to trace function entry/exit even if the CONFIG_KPROBES_ON_FTRACE is not available. Since CONFIG_KPROBES_ON_FTRACE requires the CONFIG_DYNAMIC_FTRACE_WITH_REGS, it is not available if the architecture only supports CONFIG_DYNAMIC_FTRACE_WITH_ARGS. And that means kprobe events can not probe function entry/exit effectively on such architecture. But this can be solved if the dynamic events supports fprobe events. The fprobe event is a new dynamic events which is only for the function (symbol) entry and exit. This event accepts non register fetch arguments so that user can trace the function arguments and return values. 
The fprobe events syntax is here; f[:[GRP/][EVENT]] FUNCTION [FETCHARGS] f[MAXACTIVE][:[GRP/][EVENT]] FUNCTION%return [FETCHARGS] E.g. # echo 'f vfs_read $arg1' >> dynamic_events # echo 'f vfs_read%return $retval' >> dynamic_events # cat dynamic_events f:fprobes/vfs_read__entry vfs_read arg1=$arg1 f:fprobes/vfs_read__exit vfs_read%return arg1=$retval # echo 1 > events/fprobes/enable # head -n 20 trace | tail # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | sh-142 [005] ...1. 448.386420: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386436: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 448.386451: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386458: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 448.386469: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.386476: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 sh-142 [005] ...1. 448.602073: vfs_read__entry: (vfs_read+0x4/0x340) arg1=0xffff888007f7c540 sh-142 [005] ..... 448.602089: vfs_read__exit: (ksys_read+0x75/0x100 <- vfs_read) arg1=0x1 Link: https://lore.kernel.org/all/168507469754.913472.6112857614708350210.stgit@mhiramat.roam.corp.google.com/ Reported-by: kernel test robot Link: https://lore.kernel.org/all/202302011530.7vm4O8Ro-lkp@intel.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/Kconfig | 14 + kernel/trace/Makefile | 1 + kernel/trace/fprobe.c | 11 +- kernel/trace/trace.c | 8 +- kernel/trace/trace.h | 11 + kernel/trace/trace_fprobe.c | 1053 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 4 +- kernel/trace/trace_probe.h | 3 +- 9 files changed, 1100 insertions(+), 7 deletions(-) create mode 100644 kernel/trace/trace_fprobe.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..8e10a9453c96 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -650,6 +650,20 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config FPROBE_EVENTS + depends on FPROBE + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable fprobe-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows user to add tracing events on the function entry and + exit via ftrace interface. The syntax is same as the kprobe events + and the kprobe events on function entry and exit will be + transparently converted to this fprobe events. 
+ config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c6651e16b557..64b61f67a403 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 32994815edf6..e4704ec26df7 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -348,6 +348,14 @@ int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) } EXPORT_SYMBOL_GPL(register_fprobe_syms); +bool fprobe_is_registered(struct fprobe *fp) +{ + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) + return false; + return true; +} + /** * unregister_fprobe() - Unregister fprobe from ftrace * @fp: A fprobe data structure to be unregistered. @@ -360,8 +368,7 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fprobe_is_registered(fp)) return -EINVAL; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4dde073ef..755b0bf2e1ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5672,10 +5672,16 @@ static const char readme_msg[] = " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[/][]] []\n" "\t r[maxactive][:[/][]] []\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[/][]] [%return] []\n" +#endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..b5ab5479f9e3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -148,6 +148,17 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; +struct fentry_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct fexit_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + #define TRACE_BUF_SIZE 1024 struct trace_array; diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c new file mode 100644 index 000000000000..48dbbc72b7dd --- /dev/null +++ b/kernel/trace/trace_fprobe.c @@ -0,0 +1,1053 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Fprobe-based tracing events + * Copyright (C) 2022 Google LLC. 
+ */ +#define pr_fmt(fmt) "trace_fprobe: " fmt + +#include +#include +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" + +#define FPROBE_EVENT_SYSTEM "fprobes" +#define RETHOOK_MAXACTIVE_MAX 4096 + +static int trace_fprobe_create(const char *raw_command); +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_fprobe_release(struct dyn_event *ev); +static bool trace_fprobe_is_busy(struct dyn_event *ev); +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_fprobe_ops = { + .create = trace_fprobe_create, + .show = trace_fprobe_show, + .is_busy = trace_fprobe_is_busy, + .free = trace_fprobe_release, + .match = trace_fprobe_match, +}; + +/* + * Fprobe event core functions + */ +struct trace_fprobe { + struct dyn_event devent; + struct fprobe fp; + const char *symbol; + struct trace_probe tp; +}; + +static bool is_trace_fprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_fprobe_ops; +} + +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_fprobe, devent); +} + +/** + * for_each_trace_fprobe - iterate over the trace_fprobe list + * @pos: the struct trace_fprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_fprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos))) + +static bool trace_fprobe_is_return(struct trace_fprobe *tf) +{ + return tf->fp.exit_handler != NULL; +} + +static const char *trace_fprobe_symbol(struct trace_fprobe *tf) +{ + return tf->symbol ? tf->symbol : "unknown"; +} + +static bool trace_fprobe_is_busy(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + return trace_probe_is_enabled(&tf->tp); +} + +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tf->tp, argc, argv); +} + +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) + return false; + + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) + return false; + + return trace_fprobe_match_command_head(tf, argc, argv); +} + +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) +{ + return fprobe_is_registered(&tf->fp); +} + +/* + * Note that we don't verify the fetch_insn code, since it does not come + * from user space. 
+ */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* function entry handler */ +static nokprobe_inline void +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fentry_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->ip = entry_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fentry_trace_func(tf, entry_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fentry_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fexit_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fexit_trace_func); + +#ifdef CONFIG_PERF_EVENTS + +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fentry_trace_entry_head *entry; + struct hlist_head *head; + int size, 
__size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = entry_ip; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(fentry_perf_func); + +static void +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fexit_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(fexit_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + int ret = 0; + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fentry_trace_func(tf, entry_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + ret = fentry_perf_func(tf, entry_ip, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(fentry_dispatcher); + +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fexit_trace_func(tf, entry_ip, ret_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + fexit_perf_func(tf, entry_ip, ret_ip, regs); +#endif +} +NOKPROBE_SYMBOL(fexit_dispatcher); + +static void free_trace_fprobe(struct trace_fprobe *tf) +{ + if (tf) { + trace_probe_cleanup(&tf->tp); + kfree(tf->symbol); + kfree(tf); + } +} + +/* + * Allocate new trace_probe and initialize it (including fprobe). 
+ */ +static struct trace_fprobe *alloc_trace_fprobe(const char *group, + const char *event, + const char *symbol, + int maxactive, + int nargs, bool is_return) +{ + struct trace_fprobe *tf; + int ret = -ENOMEM; + + tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + if (!tf) + return ERR_PTR(ret); + + tf->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tf->symbol) + goto error; + + if (is_return) + tf->fp.exit_handler = fexit_dispatcher; + else + tf->fp.entry_handler = fentry_dispatcher; + + tf->fp.nr_maxactive = maxactive; + + ret = trace_probe_init(&tf->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tf->devent, &trace_fprobe_ops); + return tf; +error: + free_trace_fprobe(tf); + return ERR_PTR(ret); +} + +static struct trace_fprobe *find_trace_fprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_fprobe *tf; + + for_each_trace_fprobe(tf, pos) + if (strcmp(trace_probe_name(&tf->tp), event) == 0 && + strcmp(trace_probe_group_name(&tf->tp), group) == 0) + return tf; + return NULL; +} + +static inline int __enable_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) + enable_fprobe(&tf->fp); + + return 0; +} + +static void __disable_trace_fprobe(struct trace_probe *tp) +{ + struct trace_fprobe *tf; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + if (!trace_fprobe_is_registered(tf)) + continue; + disable_fprobe(&tf->fp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + /* TODO: check the fprobe is gone */ + __enable_trace_fprobe(tf); + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_fprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. 
+ */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* Event entry printers */ +static enum print_line_t +print_fentry_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fentry_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fentry_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_fexit_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fexit_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fexit_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +static int fentry_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fentry_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int fexit_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fexit_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_functions fentry_funcs = { + .trace = print_fentry_event +}; + +static struct trace_event_functions fexit_funcs = { + .trace = print_fexit_event +}; + +static struct trace_event_fields fentry_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fentry_event_define_fields }, + {} +}; + +static struct trace_event_fields fexit_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fexit_event_define_fields }, + {} +}; + +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static inline void init_trace_event_call(struct trace_fprobe *tf) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + + if (trace_fprobe_is_return(tf)) { + call->event.funcs = &fexit_funcs; + call->class->fields_array = fexit_fields_array; + } else { + call->event.funcs = &fentry_funcs; + call->class->fields_array = fentry_fields_array; + } + + call->flags = TRACE_EVENT_FL_FPROBE; + call->class->reg = 
fprobe_register; +} + +static int register_fprobe_event(struct trace_fprobe *tf) +{ + init_trace_event_call(tf); + + return trace_probe_register_event_call(&tf->tp); +} + +static int unregister_fprobe_event(struct trace_fprobe *tf) +{ + return trace_probe_unregister_event_call(&tf->tp); +} + +/* Internal register function - just handle fprobe and flags */ +static int __register_trace_fprobe(struct trace_fprobe *tf) +{ + int i, ret; + + /* Should we need new LOCKDOWN flag for fprobe? */ + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_fprobe_is_registered(tf)) + return -EINVAL; + + for (i = 0; i < tf->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tf->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tf->tp)) + tf->fp.flags &= ~FPROBE_FL_DISABLED; + else + tf->fp.flags |= FPROBE_FL_DISABLED; + + /* TODO: handle filter, nofilter or symbol list */ + return register_fprobe(&tf->fp, tf->symbol, NULL); +} + +/* Internal unregister function - just handle fprobe and flags */ +static void __unregister_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) { + unregister_fprobe(&tf->fp); + memset(&tf->fp, 0, sizeof(tf->fp)); + } +} + +/* TODO: make this trace_*probe common function */ +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_fprobe(struct trace_fprobe *tf) +{ + /* If other probes are on the event, just unregister fprobe */ + if (trace_probe_has_sibling(&tf->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tf->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_fprobe_event(tf)) + return -EBUSY; + +unreg: + __unregister_trace_fprobe(tf); + dyn_event_remove(&tf->devent); + trace_probe_unlink(&tf->tp); + + return 0; +} + +static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, + struct trace_fprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_fprobe_symbol(orig), + trace_fprobe_symbol(comp))) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +{ + int ret; + + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + return -EEXIST; + } + ret = trace_probe_compare_arg_type(&tf->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_fprobe_has_same_fprobe(to, tf)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tf->tp, &to->tp); + if (ret) + return ret; + + ret = __register_trace_fprobe(tf); + if (ret) + trace_probe_unlink(&tf->tp); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_fprobe *old_tf; + int ret; + + mutex_lock(&event_mutex); + + old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), + trace_probe_group_name(&tf->tp)); + if (old_tf) { + ret = append_trace_fprobe(tf, old_tf); + goto end; + } + + /* Register new event */ + ret = register_fprobe_event(tf); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register fprobe */ + ret = __register_trace_fprobe(tf); + if (ret < 0) + unregister_fprobe_event(tf); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +static int __trace_fprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add fentry probe: + * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] + * - Add fexit probe: + * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth entry of stack (N:0-) + * $argN : fetch Nth argument (N:1-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. 
+ */ + struct trace_fprobe *tf = NULL; + int i, len, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + int maxactive = 0; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + + if (argv[0][0] != 'f' || argc < 2) + return -ECANCELED; + + trace_probe_log_init("trace_fprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* fprobe rethook instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > RETHOOK_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + trace_probe_log_set_index(1); + + /* a symbol specified */ + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + if (!is_return && maxactive) { + trace_probe_log_set_index(0); + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + + if (is_return) + flags |= TPARG_FL_RETURN; + else + flags |= TPARG_FL_FENTRY; + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? "exit" : "entry"); + sanitize_event_name(buf); + event = buf; + } + + /* setup a probe */ + tf = alloc_trace_fprobe(group, event, symbol, maxactive, + argc - 2, is_return); + if (IS_ERR(tf)) { + ret = PTR_ERR(tf); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tf is not allocated */ + } + argc -= 2; argv += 2; + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], flags); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ret = traceprobe_set_print_fmt(&tf->tp, + is_return ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); + if (ret < 0) + goto error; + + ret = register_trace_fprobe(tf); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + trace_probe_log_clear(); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_fprobe(tf); + goto out; +} + +static int trace_fprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_fprobe_create); +} + +static int trace_fprobe_release(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int ret = unregister_trace_fprobe(tf); + + if (!ret) + free_trace_fprobe(tf); + return ret; +} + +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int i; + + seq_putc(m, 'f'); + if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) + seq_printf(m, "%d", tf->fp.nr_maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), + trace_probe_name(&tf->tp)); + + seq_printf(m, " %s%s", trace_fprobe_symbol(tf), + trace_fprobe_is_return(tf) ? "%return" : ""); + + for (i = 0; i < tf->tp.nr_args; i++) + seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + */ +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_fprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_fprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup fprobe + * events in postcore_initcall without tracefs. 
+ */
+static __init int init_fprobe_trace_early(void)
+{
+ int ret;
+
+ ret = dyn_event_register(&trace_fprobe_ops);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+core_initcall(init_fprobe_trace_early);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 867ffb7ee31d..b7a4409674b3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -764,7 +764,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
 if (isdigit(argv[0][1])) {
 if (!is_return) {
- trace_probe_log_err(1, MAXACT_NO_KPROBE);
+ trace_probe_log_err(1, BAD_MAXACT_TYPE);
 goto parse_error;
 }
 if (event)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2d2616678295..c39860fb2e41 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -393,8 +393,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 break;
 case '%': /* named register */
- if (flags & TPARG_FL_TPOINT) {
- /* eprobes do not handle registers */
+ if (flags & (TPARG_FL_TPOINT | TPARG_FL_FPROBE)) {
+ /* eprobe and fprobe do not handle registers */
 trace_probe_log_err(offs, BAD_VAR);
 break;
 }
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5df59714f9f5..8f4f23e8b234 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -367,6 +367,7 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a
 #define TPARG_FL_FENTRY BIT(2)
 #define TPARG_FL_TPOINT BIT(3)
 #define TPARG_FL_USER BIT(4)
+#define TPARG_FL_FPROBE BIT(5)
 #define TPARG_FL_MASK GENMASK(4, 0)
 extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
@@ -409,7 +410,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
 C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
 C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
- C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
+ C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \
 C(BAD_MAXACT, "Invalid maxactive number"), \
 C(MAXACT_TOO_BIG, "Maxactive is too big"), \
 C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
-- cgit v1.2.3 

From e2d0d7b2f42dcaf924e9c891c91c9aa22cbbebce Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 6 Jun 2023 21:39:55 +0900
Subject: tracing/probes: Add tracepoint support on fprobe_events

Allow fprobe_events to trace raw tracepoints so that users can trace
tracepoints that don't have trace-event wrappers. This new event type is
always available when fprobe_events is enabled (thus needs no extra
kconfig), because fprobe_events already depends on trace events and
tracepoints.

e.g.
 # echo 't sched_overutilized_tp' >> dynamic_events
 # echo 't 9p_client_req' >> dynamic_events
 # cat dynamic_events
t:tracepoints/sched_overutilized_tp sched_overutilized_tp
t:tracepoints/_9p_client_req 9p_client_req

The event name is based on the tracepoint name, but if it starts with a
digit, an underscore '_' is prepended.

NOTE: to avoid further confusion, this renames TPARG_FL_TPOINT to
TPARG_FL_TEVENT because that flag is used for eprobes (trace-event
probes), and reuses TPARG_FL_TPOINT for this raw tracepoint probe.
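For reference, a full session with such an event might look as follows
(a usage sketch, not taken from this patch; it assumes tracefs is
mounted at /sys/kernel/tracing and that the kernel provides the
sched_overutilized_tp tracepoint):

 # cd /sys/kernel/tracing
 # echo 't sched_overutilized_tp' >> dynamic_events
 # echo 1 > events/tracepoints/sched_overutilized_tp/enable
 # cat trace

The event can be removed again with the usual dynamic-event deletion
syntax, e.g. echo '-:tracepoints/sched_overutilized_tp' >> dynamic_events.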
Link: https://lore.kernel.org/all/168507471874.913472.17214624519622959593.stgit@mhiramat.roam.corp.google.com/
Reported-by: kernel test robot
Link: https://lore.kernel.org/oe-kbuild-all/202305020453.afTJ3VVp-lkp@intel.com/
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace.c | 1 +
 kernel/trace/trace_eprobe.c | 2 +-
 kernel/trace/trace_fprobe.c | 134 +++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_probe.c | 15 +++--
 kernel/trace/trace_probe.h | 15 ++++-
 5 files changed, 151 insertions(+), 16 deletions(-)
(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 755b0bf2e1ac..fa4e1a18da70 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5681,6 +5681,7 @@ static const char readme_msg[] =
 #endif
 #ifdef CONFIG_FPROBE_EVENTS
 "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n"
+ "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n"
 #endif
 #ifdef CONFIG_HIST_TRIGGERS
 "\t s:[synthetic/]<event> <field>[;<field>]\n"
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 67e854979d53..fd64cd5d5745 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -817,7 +817,7 @@ find_and_get_event(const char *system, const char *event_name)
 static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
 {
- unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
+ unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
 int ret;
 ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags);
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 48dbbc72b7dd..aa71ccb4205c 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/rculist.h>
 #include <linux/security.h>
+#include <linux/tracepoint.h>
 #include <linux/uaccess.h>
 #include "trace_dynevent.h"
@@ -17,6 +18,7 @@
 #include "trace_probe_tmpl.h"
 #define FPROBE_EVENT_SYSTEM "fprobes"
+#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
 #define RETHOOK_MAXACTIVE_MAX 4096
 static int trace_fprobe_create(const char *raw_command);
@@ -41,6 +43,8 @@ struct trace_fprobe {
 struct dyn_event devent;
 struct fprobe fp;
 const char *symbol;
+ struct tracepoint *tpoint;
+ struct module *mod;
 struct trace_probe tp;
 };
@@ -68,6 +72,11 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf)
 return tf->fp.exit_handler != NULL;
 }
+static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
+{
+ return tf->tpoint != NULL;
+}
+
 static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
 {
 return tf->symbol ? tf->symbol : "unknown";
@@ -668,6 +677,21 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
 else
 tf->fp.flags |= FPROBE_FL_DISABLED;
+ if (trace_fprobe_is_tracepoint(tf)) {
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ /*
+ * Here, we do 2 steps to enable fprobe on a tracepoint.
+ * At first, put __probestub_##TP function on the tracepoint
+ * and put a fprobe on the stub function.
+ */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); + } + /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); } @@ -678,6 +702,12 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf) if (trace_fprobe_is_registered(tf)) { unregister_fprobe(&tf->fp); memset(&tf->fp, 0, sizeof(tf->fp)); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } } } @@ -741,7 +771,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; - if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to)) { + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) || + trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); return -EEXIST; @@ -811,6 +842,60 @@ end: return ret; } +#ifdef CONFIG_MODULES +static int __tracepoint_probe_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct tp_module *tp_mod = data; + struct trace_fprobe *tf; + struct dyn_event *pos; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + if (tp_mod->mod == tf->mod) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; +#endif /* CONFIG_MODULES */ + +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; +}; + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +static struct tracepoint *find_tracepoint(const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + return data.tpoint; +} + static int __trace_fprobe_create(int argc, const char *argv[]) { /* @@ -819,6 +904,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] * - Add fexit probe: * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * - Add tracepoint probe: + * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -844,10 +931,16 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + bool is_tracepoint = false; - if (argv[0][0] != 'f' || argc < 2) + if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; + if (argv[0][0] == 't') { + is_tracepoint = true; + group = TRACEPOINT_EVENT_SYSTEM; + } + trace_probe_log_init("trace_fprobe", argc, argv); event = strchr(&argv[0][1], ':'); @@ -881,14 +974,14 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); - /* a symbol specified */ + /* a symbol(or tracepoint) must be specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) return -ENOMEM; tmp = strchr(symbol, '%'); if (tmp) { - if (!strcmp(tmp, "%return")) { + if (!is_tracepoint && 
!strcmp(tmp, "%return")) { *tmp = '\0'; is_return = true; } else { @@ -907,6 +1000,9 @@ static int __trace_fprobe_create(int argc, const char *argv[]) else flags |= TPARG_FL_FENTRY; + if (is_tracepoint) + flags |= TPARG_FL_TPOINT; + trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -917,8 +1013,11 @@ static int __trace_fprobe_create(int argc, const char *argv[]) if (!event) { /* Make a new event name */ - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, - is_return ? "exit" : "entry"); + if (is_tracepoint) + strscpy(buf, symbol, MAX_EVENT_NAME_LEN); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? "exit" : "entry"); sanitize_event_name(buf); event = buf; } @@ -932,6 +1031,18 @@ static int __trace_fprobe_create(int argc, const char *argv[]) WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tf is not allocated */ } + + if (is_tracepoint) { + tf->tpoint = find_tracepoint(tf->symbol); + if (!tf->tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + tf->mod = __module_text_address( + (unsigned long)tf->tpoint->probestub); + } + argc -= 2; argv += 2; /* parse arguments */ @@ -991,7 +1102,10 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) struct trace_fprobe *tf = to_trace_fprobe(ev); int i; - seq_putc(m, 'f'); + if (trace_fprobe_is_tracepoint(tf)) + seq_putc(m, 't'); + else + seq_putc(m, 'f'); if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), @@ -1048,6 +1162,12 @@ static __init int init_fprobe_trace_early(void) if (ret) return ret; +#ifdef CONFIG_MODULES + ret = register_tracepoint_module_notifier(&tracepoint_module_nb); + if (ret) + return ret; +#endif + return 0; } core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c39860fb2e41..798f18d24ebc 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -292,7 +292,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (flags & TPARG_FL_TPOINT) { + if (flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; code->data = kstrdup(arg, GFP_KERNEL); @@ -326,8 +326,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (((flags & TPARG_FL_MASK) == - (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + } else if (tparg_is_function_entry(flags) && (len = str_has_prefix(arg, "arg"))) { ret = kstrtoul(arg + len, 10, ¶m); if (ret) { @@ -338,6 +337,12 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. 
+ */ + if (flags & TPARG_FL_TPOINT) + code->param++; #endif } else goto inval_var; @@ -393,7 +398,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ - if (flags & (TPARG_FL_TPOINT | TPARG_FL_FPROBE)) { + if (flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { /* eprobe and fprobe do not handle registers */ trace_probe_log_err(offs, BAD_VAR); break; @@ -633,7 +638,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TPOINT) && + if (!(flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8f4f23e8b234..e6b94fcdb886 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -359,16 +359,24 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a /* * The flags used for parsing trace_probe arguments. - * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TPOINT are mutually exclusive. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive. * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with + * TPARG_FL_KERNEL. */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_TEVENT BIT(3) #define TPARG_FL_USER BIT(4) #define TPARG_FL_FPROBE BIT(5) -#define TPARG_FL_MASK GENMASK(4, 0) +#define TPARG_FL_TPOINT BIT(6) +#define TPARG_FL_LOC_MASK GENMASK(4, 0) + +static inline bool tparg_is_function_entry(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); +} extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -415,6 +423,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ -- cgit v1.2.3 From 1b8b0cd754cdbb54058165992456368495a695ac Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Move event parameter fetching code to common parser Move trace event parameter fetching code to common parser in trace_probe.c. This simplifies eprobe's trace-event variable fetching code by introducing a parse context data structure. 
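To illustrate the resulting calling convention (a sketch assembled from
the call sites updated below, not additional code): instead of threading
a bare flags word through every parser routine, each caller now builds a
context on the stack and hands it down, and the parser keeps the running
error offset in ctx->offset instead of passing 'offs' to each helper:

	struct traceprobe_parse_context ctx = {
		.event = ep->event,	/* only set for trace-event (eprobe) probes */
		.flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
	};

	ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);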
Link: https://lore.kernel.org/all/168507472950.913472.2812253181558471278.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 44 +-------- kernel/trace/trace_fprobe.c | 4 +- kernel/trace/trace_kprobe.c | 4 +- kernel/trace/trace_probe.c | 218 ++++++++++++++++++++++++++------------------ kernel/trace/trace_probe.h | 9 +- kernel/trace/trace_uprobe.c | 8 +- 6 files changed, 155 insertions(+), 132 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index fd64cd5d5745..cb0077ba2b49 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,37 +227,6 @@ error: return ERR_PTR(ret); } -static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) -{ - struct probe_arg *parg = &ep->tp.args[i]; - struct ftrace_event_field *field; - struct list_head *head; - int ret = -ENOENT; - - head = trace_get_fields(ep->event); - list_for_each_entry(field, head, link) { - if (!strcmp(parg->code->data, field->name)) { - kfree(parg->code->data); - parg->code->data = field; - return 0; - } - } - - /* - * Argument not found on event. But allow for comm and COMM - * to be used to get the current->comm. - */ - if (strcmp(parg->code->data, "COMM") == 0 || - strcmp(parg->code->data, "comm") == 0) { - parg->code->op = FETCH_OP_COMM; - ret = 0; - } - - kfree(parg->code->data); - parg->code->data = NULL; - return ret; -} - static int eprobe_event_define_fields(struct trace_event_call *event_call) { struct eprobe_trace_entry_head field; @@ -817,19 +786,16 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + struct traceprobe_parse_context ctx = { + .event = ep->event, + .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, + }; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { - ret = trace_eprobe_tp_arg_update(ep, i); - if (ret) - trace_probe_log_err(0, BAD_ATTACH_ARG); - } - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index aa71ccb4205c..7d144e4a3fb6 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1047,8 +1047,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { .flags = flags }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7a4409674b3..1a3497719ada 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -867,8 +867,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { .flags = flags }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } diff --git 
a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 798f18d24ebc..9ebefacb6372 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,74 +283,114 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return 0; } +static int parse_trace_event_arg(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ctx->event); + list_for_each_entry(field, head, link) { + if (!strcmp(arg, field->name)) { + code->op = FETCH_OP_TP_ARG; + code->data = field; + return 0; + } + } + return -ENOENT; +} + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags, int offs) + struct fetch_insn *code, + struct traceprobe_parse_context *ctx) { unsigned long param; + int err = TP_ERR_BAD_VAR; int ret = 0; int len; - if (flags & TPARG_FL_TEVENT) { + if (ctx->flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; - } else if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) { + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + /* backward compatibility */ + ctx->offset = 0; + goto inval; + } + + if (strcmp(arg, "retval") == 0) { + if (ctx->flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - } else { - trace_probe_log_err(offs, RETVAL_ON_PROBE); - ret = -EINVAL; + return 0; } - } else if ((len = str_has_prefix(arg, "stack"))) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + + len = str_has_prefix(arg, "stack"); + if (len) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[len])) { + return 0; + } + + if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_STACK_NUM); - ret = -EINVAL; - } else { - code->op = FETCH_OP_STACK; - code->param = (unsigned int)param; + if (ret) + goto inval; + + if ((ctx->flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_STACK_NUM; + goto inval; } - } else - goto inval_var; - } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; + return 0; + } + goto inval; + } + + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; + return 0; + } + #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (tparg_is_function_entry(flags) && - (len = str_has_prefix(arg, "arg"))) { + len = str_has_prefix(arg, "arg"); + if (len && tparg_is_function_entry(ctx->flags)) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if (!param || param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_ARG_NUM); - return -EINVAL; + if (ret) + goto inval; + + if (!param || param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_ARG_NUM; + goto inval; } + code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; /* * The tracepoint probe will probe a stub function, and the * first parameter of the stub is a dummy and should be ignored. 
*/ - if (flags & TPARG_FL_TPOINT) + if (ctx->flags & TPARG_FL_TPOINT) code->param++; + return 0; + } #endif - } else - goto inval_var; - return ret; - -inval_var: - trace_probe_log_err(offs, BAD_VAR); +inval: + __trace_probe_log_err(ctx->offset, err); return -EINVAL; } @@ -383,7 +423,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags, int offs) + struct traceprobe_parse_context *ctx) { struct fetch_insn *code = *pcode; unsigned long param; @@ -394,13 +434,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags, offs); + ret = parse_probe_vars(arg + 1, type, code, ctx); break; case '%': /* named register */ - if (flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { + if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { /* eprobe and fprobe do not handle registers */ - trace_probe_log_err(offs, BAD_VAR); + trace_probe_log_err(ctx->offset, BAD_VAR); break; } ret = regs_query_register_offset(arg + 1); @@ -409,14 +449,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->param = (unsigned int)ret; ret = 0; } else - trace_probe_log_err(offs, BAD_REG_NAME); + trace_probe_log_err(ctx->offset, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) { - trace_probe_log_err(offs, BAD_MEM_ADDR); + trace_probe_log_err(ctx->offset, BAD_MEM_ADDR); break; } /* load address */ @@ -424,13 +464,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) { - trace_probe_log_err(offs, FILE_ON_KPROBE); + if (ctx->flags & TPARG_FL_KERNEL) { + trace_probe_log_err(ctx->offset, FILE_ON_KPROBE); return -EINVAL; } ret = kstrtol(arg + 2, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_FILE_OFFS); + trace_probe_log_err(ctx->offset, BAD_FILE_OFFS); break; } @@ -438,8 +478,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) { - trace_probe_log_err(offs, SYM_ON_UPROBE); + if (!(ctx->flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(ctx->offset, SYM_ON_UPROBE); return -EINVAL; } /* Preserve symbol for updating */ @@ -448,7 +488,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, if (!code->data) return -ENOMEM; if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } code->op = FETCH_OP_IMM; @@ -456,7 +496,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } /* These are fetching from memory */ if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -475,36 +515,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, arg++; /* Skip '+', because kstrtol() rejects it. */ tmp = strchr(arg, '('); if (!tmp) { - trace_probe_log_err(offs, DEREF_NEED_BRACE); + trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE); return -EINVAL; } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_DEREF_OFFS); + trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS); break; } - offs += (tmp + 1 - arg) + (arg[0] != '-' ? 
1 : 0); + ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); if (!tmp) { - trace_probe_log_err(offs + strlen(arg), + trace_probe_log_err(ctx->offset + strlen(arg), DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL, flags); + const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags); + int cur_offs = ctx->offset; *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags, offs); + ret = parse_probe_arg(arg, t2, &code, end, ctx); if (ret) break; + ctx->offset = cur_offs; if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) { - trace_probe_log_err(offs, COMM_CANT_DEREF); + trace_probe_log_err(ctx->offset, COMM_CANT_DEREF); return -EINVAL; } if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -515,7 +557,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '\\': /* Immediate value */ if (arg[1] == '"') { /* Immediate string */ - ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2); if (ret) break; code->op = FETCH_OP_DATA; @@ -523,7 +565,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } else { ret = str_to_immediate(arg + 1, &code->immediate); if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); + trace_probe_log_err(ctx->offset + 1, BAD_IMM); else code->op = FETCH_OP_IMM; } @@ -531,7 +573,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ - trace_probe_log_err(offs, BAD_FETCH_ARG); + trace_probe_log_err(ctx->offset, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -576,12 +618,13 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, unsigned int flags, int offset) + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; - char *arg; int ret, len; + char *arg; arg = kstrdup(argv, GFP_KERNEL); if (!arg) @@ -590,10 +633,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(offset, ARG_TOO_LONG); + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); goto out; } else if (len == 0) { - trace_probe_log_err(offset, NO_ARG_BODY); + trace_probe_log_err(ctx->offset, NO_ARG_BODY); goto out; } @@ -611,23 +654,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - offset += t2 + strlen(t2) - arg; - trace_probe_log_err(offset, + int offs = t2 + strlen(t2) - arg; + + trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); goto out; } else if (t3[1] != '\0') { - trace_probe_log_err(offset + t3 + 1 - arg, + trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); goto out; } if (parg->count > MAX_ARRAY_LEN) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); goto out; } @@ -638,17 +682,17 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and 
immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TEVENT) && + if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string", flags); + parg->type = find_fetch_type("string", ctx->flags); } else - parg->type = find_fetch_type(t, flags); + parg->type = find_fetch_type(t, ctx->flags); if (!parg->type) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } parg->offset = *size; @@ -670,7 +714,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags, offset); + ctx); if (ret) goto fail; @@ -681,7 +725,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_SYMSTRING); goto fail; } @@ -689,7 +733,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } @@ -708,7 +752,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } } @@ -731,7 +775,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_ST_RAW; @@ -742,7 +786,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); if (ret) { - trace_probe_log_err(offset + t - arg, BAD_BITFIELD); + trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } } @@ -752,13 +796,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? 
(t - arg) : 0),
 BAD_STRING);
 goto fail;
 }
 code++;
 if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
 goto fail;
 }
 code->op = FETCH_OP_LP_ARRAY;
@@ -807,7 +851,7 @@ static int traceprobe_conflict_field_name(const char *name,
 }
 int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
- unsigned int flags)
+ struct traceprobe_parse_context *ctx)
 {
 struct probe_arg *parg = &tp->args[i];
 const char *body;
@@ -842,9 +886,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
 trace_probe_log_err(0, USED_ARG_NAME);
 return -EINVAL;
 }
+ ctx->offset = body - arg;
 /* Parse fetch argument */
- return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
- body - arg);
+ return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx);
 }
 void traceprobe_free_probe_arg(struct probe_arg *arg)
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e6b94fcdb886..f622340ae171 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -378,8 +378,15 @@ static inline bool tparg_is_function_entry(unsigned int flags)
 return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
 }
+struct traceprobe_parse_context {
+ struct trace_event_call *event;
+ unsigned int flags;
+ int offset;
+};
+
 extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
- const char *argv, unsigned int flags);
+ const char *argv,
+ struct traceprobe_parse_context *ctx);
 extern int traceprobe_update_arg(struct probe_arg *arg);
 extern void traceprobe_free_probe_arg(struct probe_arg *arg);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8b92e34ff0c8..fa09b33ee731 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -686,10 +686,12 @@ static int __trace_uprobe_create(int argc, const char **argv)
 /* parse arguments */
 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct traceprobe_parse_context ctx = {
+ .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
+ };
+
 trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
- (is_return ? TPARG_FL_RETURN : 0) |
- TPARG_FL_USER);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
 if (ret)
 goto error;
 }
-- cgit v1.2.3 

From b576e09701c7d045bbe5cd85d53e2f34426aa214 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 6 Jun 2023 21:39:56 +0900
Subject: tracing/probes: Support function parameters if BTF is available

Support function or tracepoint parameters by name if BTF support is
enabled and the event is for function entry (this feature can be used
with kprobe-events, fprobe-events and tracepoint probe events).

Note that the BTF variable syntax does not require a prefix. If it
starts with an alphabetic character or an underscore ('_') without a
prefix like '$' or '%', it is considered a BTF variable.
If you specify only the BTF variable name, the argument name will also
be that name instead of 'arg*'.
 # echo 'p vfs_read count pos' >> dynamic_events
 # echo 'f vfs_write count pos' >> dynamic_events
 # echo 't sched_overutilized_tp rd overutilized' >> dynamic_events
 # cat dynamic_events
p:kprobes/p_vfs_read_0 vfs_read count=count pos=pos
f:fprobes/vfs_write__entry vfs_write count=count pos=pos
t:tracepoints/sched_overutilized_tp sched_overutilized_tp rd=rd overutilized=overutilized

Link: https://lore.kernel.org/all/168507474014.913472.16963996883278039183.stgit@mhiramat.roam.corp.google.com/
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Alan Maguire
Tested-by: Alan Maguire
---
 kernel/trace/Kconfig | 12 +++
 kernel/trace/trace.c | 4 +
 kernel/trace/trace_fprobe.c | 53 ++++++-----
 kernel/trace/trace_kprobe.c | 12 +--
 kernel/trace/trace_probe.c | 211 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_probe.h | 9 +-
 6 files changed, 270 insertions(+), 31 deletions(-)
(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8e10a9453c96..b3f90d602896 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -664,6 +664,18 @@ config FPROBE_EVENTS
 and the kprobe events on function entry and exit
 will be transparently converted to this fprobe events.
+config PROBE_EVENTS_BTF_ARGS
+ depends on HAVE_FUNCTION_ARG_ACCESS_API
+ depends on FPROBE_EVENTS || KPROBE_EVENTS
+ depends on DEBUG_INFO_BTF && BPF_SYSCALL
+ bool "Support BTF function arguments for probe events"
+ default y
+ help
+ The user can specify the arguments of the probe event using the names
+ of the arguments of the probed function, when the probe location is a
+ kernel function entry or a tracepoint.
+ This is available only if BTF (BPF Type Format) support is enabled.
+
 config KPROBE_EVENTS
 depends on KPROBES
 depends on HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa4e1a18da70..a70b22235eaf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5698,7 +5698,11 @@ static const char readme_msg[] =
 "\t args: <name>=fetcharg[:type]\n"
 "\t fetcharg: (%|$), @<address>, @<symbol>[+|-<offset>],\n"
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n"
+#else
 "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#endif
 #else
 "\t $stack<index>, $stack, $retval, $comm,\n"
 #endif
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 7d144e4a3fb6..2dd884609321 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -366,6 +366,7 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
 static struct trace_fprobe *alloc_trace_fprobe(const char *group,
 const char *event,
 const char *symbol,
+ struct tracepoint *tpoint,
 int maxactive,
 int nargs, bool is_return)
 {
@@ -385,6 +386,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
 else
 tf->fp.entry_handler = fentry_dispatcher;
+ tf->tpoint = tpoint;
 tf->fp.nr_maxactive = maxactive;
 ret = trace_probe_init(&tf->tp, event, group, false);
@@ -930,8 +932,12 @@ static int __trace_fprobe_create(int argc, const char *argv[])
 int maxactive = 0;
 char buf[MAX_EVENT_NAME_LEN];
 char gbuf[MAX_EVENT_NAME_LEN];
- unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE;
+ char sbuf[KSYM_NAME_LEN];
 bool is_tracepoint = false;
+ struct tracepoint *tpoint = NULL;
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
 if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
 return -ECANCELED;
@@ -995,14 +1001,6 @@ static int __trace_fprobe_create(int argc, const char *argv[])
 goto parse_error;
 }
- if (is_return)
- flags |= TPARG_FL_RETURN;
- else
- flags |= TPARG_FL_FENTRY;
-
- if (is_tracepoint)
- flags |= TPARG_FL_TPOINT;
-
 trace_probe_log_set_index(0);
 if (event) {
 ret = traceprobe_parse_event_name(&event, &group, gbuf,
@@ -1014,7 +1012,8 @@ static int __trace_fprobe_create(int argc, const char *argv[])
 if (!event) {
 /* Make a new event name */
 if (is_tracepoint)
- strscpy(buf, symbol, MAX_EVENT_NAME_LEN);
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ isdigit(*symbol) ? "_" : "", symbol);
 else
 snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
 is_return ? 
"exit" : "entry"); @@ -1022,8 +1021,27 @@ static int __trace_fprobe_create(int argc, const char *argv[]) event = buf; } + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + else + ctx.flags |= TPARG_FL_FENTRY; + + if (is_tracepoint) { + ctx.flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol); + if (!tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else + ctx.funcname = symbol; + /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, maxactive, + tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc - 2, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); @@ -1032,24 +1050,15 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto out; /* We know tf is not allocated */ } - if (is_tracepoint) { - tf->tpoint = find_tracepoint(tf->symbol); - if (!tf->tpoint) { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; - } + if (is_tracepoint) tf->mod = __module_text_address( (unsigned long)tf->tpoint->probestub); - } argc -= 2; argv += 2; - /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct traceprobe_parse_context ctx = { .flags = flags }; - trace_probe_log_set_index(i + 2); + ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a3497719ada..7cc32da3e8e8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,7 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - unsigned int flags = TPARG_FL_KERNEL; + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { case 'r': @@ -823,10 +823,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto parse_error; } if (is_return) - flags |= TPARG_FL_RETURN; + ctx.flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); if (ret == 0 && !is_return) - flags |= TPARG_FL_FENTRY; + ctx.flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); @@ -856,7 +856,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ @@ -866,10 +866,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ + ctx.funcname = symbol; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct traceprobe_parse_context ctx = { .flags = flags }; - trace_probe_log_set_index(i + 2); + ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9ebefacb6372..08c18d9d4cf2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -11,6 +11,8 @@ */ #define pr_fmt(fmt) "trace_probe: " fmt +#include + #include "trace_probe.h" #undef C @@ -300,6 +302,171 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, return -ENOENT; } +#ifdef 
CONFIG_PROBE_EVENTS_BTF_ARGS + +static struct btf *traceprobe_get_btf(void) +{ + struct btf *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(btf)) + return NULL; + + return btf; +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const char *type_from_btf_id(struct btf *btf, s32 id) +{ + const struct btf_type *t; + u32 intdata; + s32 tid; + + /* TODO: const char * could be converted as a string */ + t = btf_type_skip_modifiers(btf, id, &tid); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_ENUM: + /* enum is "int", so convert to "s32" */ + return "s32"; + case BTF_KIND_ENUM64: + return "s64"; + case BTF_KIND_PTR: + /* pointer will be converted to "x??" */ + if (IS_ENABLED(CONFIG_64BIT)) + return "x64"; + else + return "x32"; + case BTF_KIND_INT: + intdata = btf_type_int(t); + if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { + switch (BTF_INT_BITS(intdata)) { + case 8: + return "s8"; + case 16: + return "s16"; + case 32: + return "s32"; + case 64: + return "s64"; + } + } else { /* unsigned */ + switch (BTF_INT_BITS(intdata)) { + case 8: + return "u8"; + case 16: + return "u16"; + case 32: + return "u32"; + case 64: + return "u64"; + } + } + } + /* TODO: support other types */ + + return NULL; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_type *t; + s32 id; + + if (!btf || !funcname || !nr) + return ERR_PTR(-EINVAL); + + id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); + if (id <= 0) + return ERR_PTR(-ENOENT); + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(btf, id); + if (!btf_type_is_func(t)) + return ERR_PTR(-ENOENT); + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return ERR_PTR(-ENOENT); + + *nr = btf_type_vlen(t); + + if (*nr) + return (const struct btf_param *)(t + 1); + else + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_param *params; + int i; + + if (!btf) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; + } + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; + + if (!ctx->params) { + params = find_btf_func_param(ctx->funcname, &ctx->nr_params); + if (IS_ERR(params)) { + trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); + return PTR_ERR(params); + } + ctx->params = params; + } else + params = ctx->params; + + for (i = 0; i < ctx->nr_params; i++) { + const char *name = btf_name_by_offset(btf, params[i].name_off); + + if (name && !strcmp(name, varname)) { + code->op = FETCH_OP_ARG; + code->param = i; + return 0; + } + } + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; +} + +static const struct fetch_type *parse_btf_arg_type(int arg_idx, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + + if (btf && ctx->params) + typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + + return find_fetch_type(typestr, ctx->flags); +} +#else +static struct btf *traceprobe_get_btf(void) +{ + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} +#define parse_btf_arg_type(idx, ctx) \ + find_fetch_type(NULL, ctx->flags) +#endif + 
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, @@ -570,6 +737,15 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_IMM; } break; + default: + if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EINVAL; + } + ret = parse_btf_arg(arg, code, ctx); + break; + } } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ @@ -718,6 +894,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (ret) goto fail; + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + !t && code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { @@ -850,6 +1031,33 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } +static char *generate_probe_arg_name(const char *arg, int idx) +{ + char *name = NULL; + const char *end; + + /* + * If argument name is omitted, try arg as a name (BTF variable) + * or "argN". + */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + end = strchr(arg, ':'); + if (!end) + end = arg + strlen(arg); + + name = kmemdup_nul(arg, end - arg, GFP_KERNEL); + if (!name || !is_good_name(name)) { + kfree(name); + name = NULL; + } + } + + if (!name) + name = kasprintf(GFP_KERNEL, "arg%d", idx + 1); + + return name; +} + int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, struct traceprobe_parse_context *ctx) { @@ -871,8 +1079,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { - /* If argument name is omitted, set "argN" */ - parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + parg->name = generate_probe_arg_name(arg, i); body = arg; } if (!parg->name) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f622340ae171..7af121996ce9 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "trace.h" @@ -380,6 +381,9 @@ static inline bool tparg_is_function_entry(unsigned int flags) struct traceprobe_parse_context { struct trace_event_call *event; + const struct btf_param *params; + s32 nr_params; + const char *funcname; unsigned int flags; int offset; }; @@ -478,7 +482,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ - C(NO_EP_FILTER, "No filter rule after 'if'"), + C(NO_EP_FILTER, "No filter rule after 'if'"), \ + C(NOSUP_BTFARG, "BTF is not available or not supported"), \ + C(NO_BTFARG, "This variable is not found at this probe point"),\ + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From 18b1e870a49671745c31434b18bcfdd6f20cb6a1 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Add $arg* meta argument for all function args Add the '$arg*' meta fetch argument for function-entry probe events. 
This will be expanded to all the arguments of the function or the tracepoint using BTF function argument information. e.g. # echo 'p vfs_read $arg*' >> dynamic_events # echo 'f vfs_write $arg*' >> dynamic_events # echo 't sched_overutilized_tp $arg*' >> dynamic_events # cat dynamic_events p:kprobes/p_vfs_read_0 vfs_read file=file buf=buf count=count pos=pos f:fprobes/vfs_write__entry vfs_write file=file buf=buf count=count pos=pos t:tracepoints/sched_overutilized_tp sched_overutilized_tp rd=rd overutilized=overutilized Also, a single '$arg[0-9]*' will be converted to the BTF function argument. NOTE: This seems like a wildcard, but it is a fake one at this moment. This is just for telling the user that this can be expanded to several arguments. And unlike other $-vars, you cannot use this $arg* as a part of fetch args, e.g. specifying a name "foo=$arg*" or using it in dereferences "+0($arg*)" will lead to a parse error. Link: https://lore.kernel.org/all/168507475126.913472.18329684401466211816.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 21 +++++- kernel/trace/trace_kprobe.c | 23 ++++-- kernel/trace/trace_probe.c | 169 ++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_probe.h | 11 ++- 4 files changed, 212 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 2dd884609321..dfe2e546acdc 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -925,14 +925,16 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_fprobe *tf = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + const char **new_argv = NULL; int maxactive = 0; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1040,9 +1042,22 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } else ctx.funcname = symbol; + argc -= 2; argv += 2; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1054,7 +1069,6 @@ static int __trace_fprobe_create(int argc, const char *argv[]) tf->mod = __module_text_address( (unsigned long)tf->tpoint->probestub); - argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); @@ -1083,6 +1097,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7cc32da3e8e8..74adb82331dd 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -732,9 +732,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_kprobe *tk = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; + const char **new_argv = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -742,6 +743,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -854,19 +856,31 @@ static int __trace_kprobe_create(int argc, const char *argv[]) event = buf; } + argc -= 2; argv += 2; + ctx.funcname = symbol; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tk is not allocated */ } - argc -= 2; argv += 2; /* parse arguments */ - ctx.funcname = symbol; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; @@ -894,6 +908,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 08c18d9d4cf2..7216435d6728 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -371,9 +371,11 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr) +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) { struct btf *btf = traceprobe_get_btf(); + const struct btf_param *param; const struct btf_type *t; s32 id; @@ -395,9 +397,16 @@ static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr return ERR_PTR(-ENOENT); *nr = btf_type_vlen(t); + param = (const struct btf_param *)(t + 1); - if (*nr) - return (const struct btf_param *)(t + 1); + /* Hide the first 'data' argument of tracepoint */ + if (tracepoint) { + (*nr)--; + param++; + } + + if (*nr > 0) + return param; else return NULL; } @@ -418,7 +427,8 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, return -EINVAL; if (!ctx->params) { - params = find_btf_func_param(ctx->funcname, &ctx->nr_params); + params = find_btf_func_param(ctx->funcname, &ctx->nr_params, + ctx->flags & TPARG_FL_TPOINT); if (IS_ERR(params)) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); @@ -451,12 +461,19 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, return find_fetch_type(typestr, ctx->flags); } + #else static struct btf *traceprobe_get_btf(void) { return NULL; } +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static int parse_btf_arg(const char *varname, struct fetch_insn *code, struct traceprobe_parse_context *ctx) { @@ -1114,6 +1131,150 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->fmt); } +static int argv_has_var_arg(int argc, const char 
*argv[], int *args_idx, + struct traceprobe_parse_context *ctx) +{ + int i, found = 0; + + for (i = 0; i < argc; i++) + if (str_has_prefix(argv[i], "$arg")) { + trace_probe_log_set_index(i + 2); + + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(0, NOFENTRY_ARGS); + return -EINVAL; + } + + if (isdigit(argv[i][4])) { + found = 1; + continue; + } + + if (argv[i][4] != '*') { + trace_probe_log_err(0, BAD_VAR); + return -EINVAL; + } + + if (*args_idx >= 0 && *args_idx < argc) { + trace_probe_log_err(0, DOUBLE_ARGS); + return -EINVAL; + } + found = 1; + *args_idx = i; + } + + return found; +} + +static int sprint_nth_btf_arg(int idx, const char *type, + char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *name; + int ret; + + if (idx >= ctx->nr_params) { + trace_probe_log_err(0, NO_BTFARG); + return -ENOENT; + } + name = btf_name_by_offset(btf, ctx->params[idx].name_off); + if (!name) { + trace_probe_log_err(0, NO_BTF_ENTRY); + return -ENOENT; + } + ret = snprintf(buf, bufsize, "%s%s", name, type); + if (ret >= bufsize) { + trace_probe_log_err(0, ARGS_2LONG); + return -E2BIG; + } + return ret; +} + +/* Return new_argv which must be freed after use */ +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const struct btf_param *params = NULL; + int i, j, n, used, ret, args_idx = -1; + const char **new_argv = NULL; + int nr_params; + + ret = argv_has_var_arg(argc, argv, &args_idx, ctx); + if (ret < 0) + return ERR_PTR(ret); + + if (!ret) { + *new_argc = argc; + return NULL; + } + + params = find_btf_func_param(ctx->funcname, &nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR(params)) { + if (args_idx != -1) { + /* $arg* requires BTF info */ + trace_probe_log_err(0, NOSUP_BTFARG); + return (const char **)params; + } + return 0; + } + ctx->params = params; + ctx->nr_params = nr_params; + + if (args_idx >= 0) + *new_argc = argc + ctx->nr_params - 1; + else + *new_argc = argc; + + new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL); + if (!new_argv) + return ERR_PTR(-ENOMEM); + + used = 0; + for (i = 0, j = 0; i < argc; i++) { + trace_probe_log_set_index(i + 2); + if (i == args_idx) { + for (n = 0; n < nr_params; n++) { + ret = sprint_nth_btf_arg(n, "", buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + + new_argv[j++] = buf + used; + used += ret + 1; + } + continue; + } + + if (str_has_prefix(argv[i], "$arg")) { + char *type = NULL; + + n = simple_strtoul(argv[i] + 4, &type, 10); + if (type && !(*type == ':' || *type == '\0')) { + trace_probe_log_err(0, BAD_VAR); + ret = -ENOENT; + goto error; + } + /* Note: $argN starts from $arg1 */ + ret = sprint_nth_btf_arg(n - 1, type, buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + new_argv[j++] = buf + used; + used += ret + 1; + } else + new_argv[j++] = argv[i]; + } + + return new_argv; + +error: + kfree(new_argv); + return ERR_PTR(ret); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7af121996ce9..e7fa2f2ed01c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -33,7 +33,9 @@ #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 +#define MAX_BTF_ARGS_LEN 128 #define MAX_STRING_SIZE PATH_MAX +#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* 
Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -391,6 +393,9 @@ struct traceprobe_parse_context { extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, struct traceprobe_parse_context *ctx); +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); @@ -485,7 +490,11 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EP_FILTER, "No filter rule after 'if'"), \ C(NOSUP_BTFARG, "BTF is not available or not supported"), \ C(NO_BTFARG, "This variable is not found at this probe point"),\ - C(NO_BTF_ENTRY, "No BTF entry for this probe point"), + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ + C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From fd26290ec89d4eae8570e027df3b8c519d285fd0 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Jun 2023 21:39:56 +0900 Subject: tracing/probes: Add BTF retval type support Check that the target function has a non-void return (retval) type and set the correct fetch type if the user doesn't specify it. If the function returns void, $retval is rejected as below: # echo 'f unregister_kprobes%return $retval' >> dynamic_events sh: write error: No such file or directory # cat error_log [ 37.488397] trace_fprobe: error: This function returns 'void' type Command: f unregister_kprobes%return $retval ^ Link: https://lore.kernel.org/all/168507476195.913472.16290308831790216609.stgit@mhiramat.roam.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 69 +++++++++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.h | 1 + 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 7216435d6728..ba1c6e059b51 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -371,15 +371,13 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +static const struct btf_type *find_btf_func_proto(const char *funcname) { struct btf *btf = traceprobe_get_btf(); const struct btf_type *t; s32 id; - if (!btf || !funcname || !nr) + if (!btf || !funcname) return ERR_PTR(-EINVAL); id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); @@ -396,6 +394,22 @@ static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr if (!btf_type_is_func_proto(t)) return ERR_PTR(-ENOENT); + return t; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + const struct btf_param *param; + const struct btf_type *t; + + if (!funcname || !nr) + return ERR_PTR(-EINVAL); + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return (const struct btf_param *)t; + *nr = btf_type_vlen(t); param = (const struct btf_param *)(t + 1); @@ -462,6 +476,32 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, return find_fetch_type(typestr, 
ctx->flags); } +static const struct fetch_type *parse_btf_retval_type( + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + const struct btf_type *t; + + if (btf && ctx->funcname) { + t = find_btf_func_proto(ctx->funcname); + if (!IS_ERR(t)) + typestr = type_from_btf_id(btf, t->type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static bool is_btf_retval_void(const char *funcname) +{ + const struct btf_type *t; + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return false; + + return t->type == 0; +} #else static struct btf *traceprobe_get_btf(void) { @@ -480,8 +520,15 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EOPNOTSUPP; } + #define parse_btf_arg_type(idx, ctx) \ find_fetch_type(NULL, ctx->flags) + +#define parse_btf_retval_type(ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define is_btf_retval_void(funcname) (false) + #endif #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) @@ -512,6 +559,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (ctx->flags & TPARG_FL_RETURN) { + if ((ctx->flags & TPARG_FL_KERNEL) && + is_btf_retval_void(ctx->funcname)) { + err = TP_ERR_NO_RETVAL; + goto inval; + } code->op = FETCH_OP_RETVAL; return 0; } @@ -912,9 +964,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto fail; /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && - !t && code->op == FETCH_OP_ARG) - parg->type = parse_btf_arg_type(code->param, ctx); + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { + if (code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + else if (code->op == FETCH_OP_RETVAL) + parg->type = parse_btf_retval_type(ctx); + } ret = -EINVAL; /* Store operation */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e7fa2f2ed01c..01ea148723de 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -449,6 +449,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(NO_RETVAL, "This function returns 'void' type"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ C(BAD_VAR, "Invalid $-valiable specified"), \ -- cgit v1.2.3 From aa7881fcfe9d328484265d589bc2785533e33c4d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 6 Jun 2023 11:53:08 +0800 Subject: bpf: Factor out a common helper free_all() Factor out a common helper free_all() to free all normal elements or per-cpu elements on a lock-less list. 
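For illustration, a minimal sketch of why the helper must walk the list with llist_for_each_safe() rather than llist_for_each() (simplified and illustrative only, not part of the patch): each node is freed inside the loop body, and the _safe variant caches the next pointer before the body runs, so the traversal never touches freed memory.

    struct llist_node *pos, *t;

    /* 't' is loaded from pos->next before the body executes, so
     * freeing 'pos' here is safe. */
    llist_for_each_safe(pos, t, llnode)
        free_one(pos, percpu);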
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20230606035310.4026145-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 410637c225fb..0668bcd7c926 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -211,9 +211,9 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) mem_cgroup_put(memcg); } -static void free_one(struct bpf_mem_cache *c, void *obj) +static void free_one(void *obj, bool percpu) { - if (c->percpu_size) { + if (percpu) { free_percpu(((void **)obj)[1]); kfree(obj); return; @@ -222,14 +222,19 @@ static void free_one(struct bpf_mem_cache *c, void *obj) kfree(obj); } -static void __free_rcu(struct rcu_head *head) +static void free_all(struct llist_node *llnode, bool percpu) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); - struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); struct llist_node *pos, *t; llist_for_each_safe(pos, t, llnode) - free_one(c, pos); + free_one(pos, percpu); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } @@ -432,7 +437,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) static void drain_mem_cache(struct bpf_mem_cache *c) { - struct llist_node *llnode, *t; + bool percpu = !!c->percpu_size; /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in @@ -441,14 +446,10 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra)) - free_one(c, llnode); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(__llist_del_all(&c->free_llist), percpu); + free_all(__llist_del_all(&c->free_llist_extra), percpu); } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) -- cgit v1.2.3 From 4d3af20eaf3fcd481a797738bb46634e37f4a1cc Mon Sep 17 00:00:00 2001 From: Sui Jingfeng Date: Sun, 4 Jun 2023 21:50:54 +0800 Subject: dma-mapping: fix a Kconfig typo 'thing' -> 'think' Signed-off-by: Sui Jingfeng Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..fd35ed4c67b8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -39,7 +39,7 @@ config ARCH_HAS_DMA_SET_MASK # # Select this option if the architecture needs special handling for # DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what -# people thing of when saying write combine, so very few platforms should +# people think of when saying write combine, so very few platforms should # need to enable this. 
# config ARCH_HAS_DMA_WRITE_COMBINE -- cgit v1.2.3 From 51ff97d54f02b4444dfc42e380ac4c058e12d5dd Mon Sep 17 00:00:00 2001 From: gaoxu Date: Tue, 6 Jun 2023 12:47:37 +0000 Subject: dma-remap: use kvmalloc_array/kvfree for larger dma memory remap If dma_direct_alloc() allocates 64MB of memory, the inner function dma_common_contiguous_remap() will allocate 128KB of memory by invoking kmalloc_array(), and that 128KB kmalloc_array() allocation can fail: Call trace: [14977.928623] qcrosvm: page allocation failure: order:5, mode:0x40cc0 [14977.928638] dump_backtrace.cfi_jt+0x0/0x8 [14977.928647] dump_stack_lvl+0x80/0xb8 [14977.928652] warn_alloc+0x164/0x200 [14977.928657] __alloc_pages_slowpath+0x9f0/0xb4c [14977.928660] __alloc_pages+0x21c/0x39c [14977.928662] kmalloc_order+0x48/0x108 [14977.928666] kmalloc_order_trace+0x34/0x154 [14977.928668] __kmalloc+0x548/0x7e4 [14977.928673] dma_direct_alloc+0x11c/0x4f8 [14977.928678] dma_alloc_attrs+0xf4/0x138 [14977.928680] gh_vm_ioctl_set_fw_name+0x3c4/0x610 [gunyah] [14977.928698] gh_vm_ioctl+0x90/0x14c [gunyah] [14977.928705] __arm64_sys_ioctl+0x184/0x210 Work around this by using kvmalloc_array() instead. Signed-off-by: Gao Xu Reviewed-by: Suren Baghdasaryan Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index b4526668072e..27596f3b4aef 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -43,13 +43,13 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, void *vaddr; int i; - pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); + pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; for (i = 0; i < count; i++) pages[i] = nth_page(page, i); vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); - kfree(pages); + kvfree(pages); return vaddr; } -- cgit v1.2.3 From 693405cf11357017ea29764e3bd9488a2d292d8f Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 7 Jun 2023 11:04:18 +0200 Subject: swiotlb: use the atomic counter of total used slabs if available If DEBUG_FS is enabled, the cost of keeping an exact number of total used slabs is already paid. In this case, there is no reason to use an inexact number for statistics and kernel messages. 
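A rough sketch of the idea (illustrative only; mem->total_used is the counter read in the diff below, and the maintenance sites shown here are an assumption, not part of this patch):

    /* exact accounting, already paid for under CONFIG_DEBUG_FS ... */
    atomic_long_add(nslots, &mem->total_used);  /* when slots are allocated */
    atomic_long_sub(nslots, &mem->total_used);  /* when slots are released */

    /* ... so reporting can simply read it back instead of summing
     * per-area estimates */
    return atomic_long_read(&mem->total_used);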
Signed-off-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index af2e304c672c..775f7bb10ab1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -717,6 +717,15 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, return -1; } +#ifdef CONFIG_DEBUG_FS + +static unsigned long mem_used(struct io_tlb_mem *mem) +{ + return atomic_long_read(&mem->total_used); +} + +#else /* !CONFIG_DEBUG_FS */ + static unsigned long mem_used(struct io_tlb_mem *mem) { int i; @@ -727,6 +736,8 @@ static unsigned long mem_used(struct io_tlb_mem *mem) return used; } +#endif /* CONFIG_DEBUG_FS */ + phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, unsigned int alloc_align_mask, enum dma_data_direction dir, -- cgit v1.2.3 From 1fd96a3e9d5d4febe1a8486590ad52c048d1be77 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 5 Jun 2023 11:07:18 +0000 Subject: riscv: Add prctl controls for userspace vector management This patch adds two riscv-specific prctls to allow userspace to control the use of the vector unit: * PR_RISCV_V_SET_CONTROL: control the permission to use Vector at the next, or all following, execve for a thread. Turning off Vector for a live thread is not possible, since libraries may have registered ifuncs that execute Vector instructions. * PR_RISCV_V_GET_CONTROL: get the same permission setting for the current thread, and the setting for following execve(s). Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Vincent Chen Link: https://lore.kernel.org/r/20230605110724.21391-22-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- kernel/sys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 339fee3eff6a..05f838929e72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -140,6 +140,12 @@ #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif +#ifndef RISCV_V_SET_CONTROL +# define RISCV_V_SET_CONTROL(a) (-EINVAL) +#endif +#ifndef RISCV_V_GET_CONTROL +# define RISCV_V_GET_CONTROL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2708,6 +2714,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); break; #endif + case PR_RISCV_V_SET_CONTROL: + error = RISCV_V_SET_CONTROL(arg2); + break; + case PR_RISCV_V_GET_CONTROL: + error = RISCV_V_GET_CONTROL(); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From b06e9318bfd03f531b918bd27b63f1eb88c21729 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Thu, 8 Jun 2023 09:01:18 +0530 Subject: kallsyms: move kallsyms_show_value() out of kallsyms.c The function kallsyms_show_value() is used by other parts such as modules_open() and kprobes_read(), which can also work when KALLSYMS is disabled. For example, as of now lsmod does not show module addresses if KALLSYMS is disabled: since no kallsyms_show_value() definition is present, it returns false under !KALLSYMS. / # lsmod test 12288 0 - Live 0x0000000000000000 (O) So kallsyms_show_value() can be made generic without a dependency on KALLSYMS; thus, move the function out to a new file, ksyms_common.c. With this patch the code is only moved to a new file; there is no functional change. 
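For reference, a hypothetical caller-side sketch of the pattern this function serves (illustrative only; the call site shown here is assumed, not taken from the patch):

    /* show a real kernel address only if the opener's credentials
     * permit it, otherwise print a null placeholder */
    if (kallsyms_show_value(file->f_cred))
        seq_printf(m, " 0x%px", addr);
    else
        seq_printf(m, " 0x%p", NULL);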
Co-developed-by: Onkarnath Signed-off-by: Onkarnath Signed-off-by: Maninder Singh Reviewed-by: Zhen Lei Signed-off-by: Luis Chamberlain --- kernel/Makefile | 2 +- kernel/kallsyms.c | 35 ----------------------------------- kernel/ksyms_common.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 36 deletions(-) create mode 100644 kernel/ksyms_common.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index b69c95315480..e7e63bb4b7fe 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o regset.o + async.o range.o smpboot.o ucount.o regset.o ksyms_common.o obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8193e947aa10..0f82c3d5a57d 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -907,41 +907,6 @@ late_initcall(bpf_ksym_iter_register); #endif /* CONFIG_BPF_SYSCALL */ -static inline int kallsyms_for_perf(void) -{ -#ifdef CONFIG_PERF_EVENTS - extern int sysctl_perf_event_paranoid; - if (sysctl_perf_event_paranoid <= 1) - return 1; -#endif - return 0; -} - -/* - * We show kallsyms information even to normal users if we've enabled - * kernel profiling and are explicitly not paranoid (so kptr_restrict - * is clear, and sysctl_perf_event_paranoid isn't set). - * - * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to - * block even that). - */ -bool kallsyms_show_value(const struct cred *cred) -{ - switch (kptr_restrict) { - case 0: - if (kallsyms_for_perf()) - return true; - fallthrough; - case 1: - if (security_capable(cred, &init_user_ns, CAP_SYSLOG, - CAP_OPT_NOAUDIT) == 0) - return true; - fallthrough; - default: - return false; - } -} - static int kallsyms_open(struct inode *inode, struct file *file) { /* diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c new file mode 100644 index 000000000000..3840fa1c9c86 --- /dev/null +++ b/kernel/ksyms_common.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ksyms_common.c: A split of kernel/kallsyms.c + * Contains a few generic function definations independent of config KALLSYMS. + */ +#include +#include + +#ifdef CONFIG_KALLSYMS +static inline int kallsyms_for_perf(void) +{ +#ifdef CONFIG_PERF_EVENTS + extern int sysctl_perf_event_paranoid; + + if (sysctl_perf_event_paranoid <= 1) + return 1; +#endif + return 0; +} + +/* + * We show kallsyms information even to normal users if we've enabled + * kernel profiling and are explicitly not paranoid (so kptr_restrict + * is clear, and sysctl_perf_event_paranoid isn't set). + * + * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to + * block even that). + */ +bool kallsyms_show_value(const struct cred *cred) +{ + switch (kptr_restrict) { + case 0: + if (kallsyms_for_perf()) + return true; + fallthrough; + case 1: + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; + fallthrough; + default: + return false; + } +} +#endif -- cgit v1.2.3 From 0eeaf1eb40a34fddd1d568a9b32c3d6669238743 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Thu, 8 Jun 2023 09:01:19 +0530 Subject: kallsyms: make kallsyms_show_value() as generic function This change makes function kallsyms_show_value() as generic function without dependency on CONFIG_KALLSYMS. 
Now module address will be displayed with lsmod and /proc/modules. Earlier: ======= / # insmod test.ko / # lsmod test 12288 0 - Live 0x0000000000000000 (O) // No Module Load address / # With change: ========== / # insmod test.ko / # lsmod test 12288 0 - Live 0xffff800000fc0000 (O) // Module address / # cat /proc/modules test 12288 0 - Live 0xffff800000fc0000 (O) Co-developed-by: Onkarnath Signed-off-by: Onkarnath Signed-off-by: Maninder Singh Reviewed-by: Zhen Lei Signed-off-by: Luis Chamberlain --- kernel/ksyms_common.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c index 3840fa1c9c86..cf1a73cbf2f6 100644 --- a/kernel/ksyms_common.c +++ b/kernel/ksyms_common.c @@ -6,7 +6,6 @@ #include #include -#ifdef CONFIG_KALLSYMS static inline int kallsyms_for_perf(void) { #ifdef CONFIG_PERF_EVENTS @@ -42,4 +41,3 @@ bool kallsyms_show_value(const struct cred *cred) return false; } } -#endif -- cgit v1.2.3 From 861dc0b46432a7086bc6de526aae775b4d615e28 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Sun, 28 May 2023 13:43:46 -0700 Subject: sysctl: move umh sysctl registration to its own file Move the umh sysctl registration to its own file, the array is already there. We do this to remove the clutter out of kernel/sysctl.c to avoid merge conflicts. This also lets the sysctls not be built at all now when CONFIG_SYSCTL is not enabled. This has a small penalty of 23 bytes but soon we'll be removing all the empty entries on sysctl arrays so just do this cleanup now: ./scripts/bloat-o-meter vmlinux.base vmlinux.1 add/remove: 2/0 grow/shrink: 0/1 up/down: 49/-26 (23) Function old new delta init_umh_sysctls - 33 +33 __pfx_init_umh_sysctls - 16 +16 sysctl_init_bases 111 85 -26 Total: Before=21256914, After=21256937, chg +0.00% Acked-by: Jarkko Sakkinen Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 1 - kernel/umh.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 241b817c0240..caf4a91522a1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2322,7 +2322,6 @@ static struct ctl_table vm_table[] = { int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); - register_sysctl_init("kernel/usermodehelper", usermodehelper_table); #ifdef CONFIG_KEYS register_sysctl_init("kernel/keys", key_sysctls); #endif diff --git a/kernel/umh.c b/kernel/umh.c index 60aa9e764a38..41088c5c39fd 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,7 +544,8 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -struct ctl_table usermodehelper_table[] = { +#if defined(CONFIG_SYSCTL) +static struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, @@ -561,3 +562,11 @@ struct ctl_table usermodehelper_table[] = { }, { } }; + +static int __init init_umh_sysctls(void) +{ + register_sysctl_init("kernel/usermodehelper", usermodehelper_table); + return 0; +} +early_initcall(init_umh_sysctls); +#endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 28898e260a34e840f86ca80bf0c7657d76ad3f80 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Sun, 28 May 2023 13:54:20 -0700 Subject: sysctl: move security keys sysctl registration to its own file The security keys sysctls are already declared on its own file, just move the sysctl registration to its own file to help avoid merge conflicts on sysctls.c, and help with clearing up sysctl.c further. 
This creates a small penalty of 23 bytes: ./scripts/bloat-o-meter vmlinux.1 vmlinux.2 add/remove: 2/0 grow/shrink: 0/1 up/down: 49/-26 (23) Function old new delta init_security_keys_sysctls - 33 +33 __pfx_init_security_keys_sysctls - 16 +16 sysctl_init_bases 85 59 -26 Total: Before=21256937, After=21256960, chg +0.00% But soon we'll be saving tons of bytes anyway, as we modify the sysctl registrations to use ARRAY_SIZE and so we get rid of all the empty array elements so let's just clean this up now. Reviewed-by: Paul Moore Acked-by: Jarkko Sakkinen Acked-by: David Howells Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index caf4a91522a1..48046932d573 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2322,10 +2322,6 @@ static struct ctl_table vm_table[] = { int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); -#ifdef CONFIG_KEYS - register_sysctl_init("kernel/keys", key_sysctls); -#endif - register_sysctl_init("vm", vm_table); return 0; -- cgit v1.2.3 From 353e7300a1db928e427462f2745f9a2cd1625b3d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 12 May 2023 17:31:17 +0200 Subject: kcsan: Don't expect 64 bits atomic builtins from 32 bits architectures Activating KCSAN on a 32 bits architecture leads to the following link-time failure: LD .tmp_vmlinux.kallsyms1 powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_load': kernel/kcsan/core.c:1273: undefined reference to `__atomic_load_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_store': kernel/kcsan/core.c:1273: undefined reference to `__atomic_store_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_exchange': kernel/kcsan/core.c:1273: undefined reference to `__atomic_exchange_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_add': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_add_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_sub': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_sub_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_and': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_and_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_or': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_or_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_xor': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_xor_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_fetch_nand': kernel/kcsan/core.c:1273: undefined reference to `__atomic_fetch_nand_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_compare_exchange_strong': kernel/kcsan/core.c:1273: undefined reference to `__atomic_compare_exchange_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_compare_exchange_weak': kernel/kcsan/core.c:1273: undefined reference to `__atomic_compare_exchange_8' powerpc64-linux-ld: kernel/kcsan/core.o: in function `__tsan_atomic64_compare_exchange_val': kernel/kcsan/core.c:1273: undefined reference to `__atomic_compare_exchange_8' 32 bits architectures don't have 64 bits atomic builtins. Only include DEFINE_TSAN_ATOMIC_OPS(64) on 64 bits architectures. 
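To illustrate the root cause (a hypothetical userspace-style example, not from the patch): on most 32-bit targets the compiler cannot expand an 8-byte atomic inline, so it emits a call into libatomic, which the kernel does not provide:

    #include <stdint.h>

    uint64_t v;

    uint64_t load_v(void)
    {
        /* on e.g. 32-bit powerpc this typically compiles to a call to
         * __atomic_load_8() rather than an inline instruction sequence */
        return __atomic_load_n(&v, __ATOMIC_SEQ_CST);
    }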
Fixes: 0f8ad5f2e934 ("kcsan: Add support for atomic builtins") Suggested-by: Marco Elver Signed-off-by: Christophe Leroy Reviewed-by: Marco Elver Acked-by: Marco Elver Signed-off-by: Michael Ellerman Link: https://msgid.link/d9c6afc28d0855240171a4e0ad9ffcdb9d07fceb.1683892665.git.christophe.leroy@csgroup.eu --- kernel/kcsan/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 5a60cc52adc0..8a7baf4e332e 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1270,7 +1270,9 @@ static __always_inline void kcsan_atomic_builtin_memorder(int memorder) DEFINE_TSAN_ATOMIC_OPS(8); DEFINE_TSAN_ATOMIC_OPS(16); DEFINE_TSAN_ATOMIC_OPS(32); +#ifdef CONFIG_64BIT DEFINE_TSAN_ATOMIC_OPS(64); +#endif void __tsan_atomic_thread_fence(int memorder); void __tsan_atomic_thread_fence(int memorder) -- cgit v1.2.3 From 0a2dc6ac33297f8a1a65f81b633a1ea753f19f69 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:20 +0000 Subject: cgroup: remove cgroup_rstat_flush_atomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches removed the only caller of cgroup_rstat_flush_atomic(). Remove the function and simplify the code. Link: https://lkml.kernel.org/r/20230421174020.2994750-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Tejun Heo Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- kernel/cgroup/rstat.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9c4c55228567..2542c21b6b6d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __diag_pop(); /* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; @@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) } raw_spin_unlock_irqrestore(cpu_lock, flags); - /* if @may_sleep, play nice and yield if necessary */ - if (may_sleep && (need_resched() || - spin_needbreak(&cgroup_rstat_lock))) { + /* play nice and yield if necessary */ + if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); @@ -236,25 +235,10 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); spin_unlock_irq(&cgroup_rstat_lock); } -/** - * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush() - * @cgrp: target cgroup - * - * This function can be called from any context. 
- */ -void cgroup_rstat_flush_atomic(struct cgroup *cgrp) -{ - unsigned long flags; - - spin_lock_irqsave(&cgroup_rstat_lock, flags); - cgroup_rstat_flush_locked(cgrp, false); - spin_unlock_irqrestore(&cgroup_rstat_lock, flags); -} - /** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup @@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); } /** -- cgit v1.2.3 From cf264e1329fb0307e044f7675849f9f38b44c11a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:07 -0700 Subject: cachestat: implement cachestat syscall There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really does not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing a database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for diagnosing performance issues. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries the cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for the x86 architecture. NAME cachestat - query the page cache statistics of a file. SYNOPSIS #include struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that was previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e. no flag specified). Currently, hugetlbfs is not supported. 
Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_args points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Brian Foster Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3 From 31a1b9d7fe768db521b12287ec6426983e9787e3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:17 +0800 Subject: mm: page_alloc: move mark_free_page() into snapshot.c The mark_free_page() is only used in kernel/power/snapshot.c, move it out to reduce a bit of page_alloc.c Link: https://lkml.kernel.org/r/20230516063821.121844-10-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- kernel/power/snapshot.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cd8b7b35f1e8..45ef0bf81c85 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone) return 2 * rtree; } +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. + */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + #ifdef CONFIG_HIGHMEM /** * count_free_highmem_pages - Compute the total number of free highmem pages. 
-- cgit v1.2.3 From 07f44ac3c90c50a201307d3fe4dda120ee8394f5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:18 +0800 Subject: mm: page_alloc: move pm_* functions into power pm_restrict_gfp_mask()/pm_restore_gfp_mask() are only used by the power code, so move them out of page_alloc.c. Add a general gfp_has_io_fs() helper which returns true if a gfp mask has both the __GFP_IO and __GFP_FS flags set, use it inside pm_suspended_storage(), and move pm_suspended_storage() into suspend.h. Link: https://lkml.kernel.org/r/20230516063821.121844-11-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- kernel/power/main.c | 27 +++++++++++++++++++++++++++ kernel/power/power.h | 5 +++++ 2 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 3113ec2f1db4..34fc8359145b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,33 @@ #include "power.h" #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). + */ +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} unsigned int lock_system_sleep(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..ac14d1b463d1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -216,6 +216,11 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); +void pm_restrict_gfp_mask(void); +void pm_restore_gfp_mask(void); +#else +static inline void pm_restrict_gfp_mask(void) {} +static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From e95d372c4cd46b6ec4eeacc07adcb7260ab4cfa0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:20 +0800 Subject: mm: page_alloc: move sysctls into their own file This moves all page-alloc-related sysctls into their own file, as part of the kernel/sysctl.c spring cleaning, and also moves some function declarations from mm.h into internal.h. Link: https://lkml.kernel.org/r/20230516063821.121844-13-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J.
Wysocki Signed-off-by: Andrew Morton --- kernel/sysctl.c | 67 --------------------------------------------------------- 1 file changed, 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..a57de67f032f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2119,13 +2119,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, { .procname = "drop_caches", .data = &sysctl_drop_caches, @@ -2135,39 +2128,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_THREE_THOUSAND, - }, - { - .procname = "percpu_pagelist_high_fraction", - .data = &percpu_pagelist_high_fraction, - .maxlen = sizeof(percpu_pagelist_high_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, @@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, #endif #ifdef CONFIG_SMP { @@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = { .proc_handler = mmap_min_addr_handler, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { -- cgit v1.2.3 From 0b295316b3a9b7858eafbebdc31b4827a6edde03 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:36 +0100 Subject: mm/gup: remove unused vmas parameter from pin_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No invocation of pin_user_pages_remote() uses the vmas parameter, so remove it. This forms part of a larger patch set eliminating the use of the vmas parameters altogether. 
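For callers the conversion is mechanical: the unused vmas argument simply drops out of the list. An illustrative before/after (an editor's sketch, not a real call site; mm, uaddr and page are stand-in names):

	/* before: the vmas out-parameter had to be passed even when unused */
	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
				    &page, NULL, NULL);

	/* after: only the locked out-parameter remains after pages */
	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
				    &page, NULL);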
Link: https://lkml.kernel.org/r/28f000beb81e45bf538a2aaa77c90f5482b67a32.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Cc: Catalin Marinas Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..bdc2666e8d39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) -- cgit v1.2.3 From ca5e863233e8f6acd1792fd85d6bc2729a1b2c10 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:39 +0100 Subject: mm/gup: remove vmas parameter from get_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only instances of get_user_pages_remote() invocations which used the vmas parameter were for a single page which can instead simply look up the VMA directly. In particular:- - __update_ref_ctr() looked up the VMA but did nothing with it so we simply remove it. - __access_remote_vm() was already using vma_lookup() when the original lookup failed so by doing the lookup directly this also de-duplicates the code. We are able to perform these VMA operations as we already hold the mmap_lock in order to be able to call get_user_pages_remote(). As part of this work we add get_user_page_vma_remote() which abstracts the VMA lookup, error handling and decrementing the page reference count should the VMA lookup fail. This forms part of a broader set of patches intended to eliminate the vmas parameter altogether. [akpm@linux-foundation.org: avoid passing NULL to PTR_ERR] Link: https://lkml.kernel.org/r/d20128c849ecdbf4dd01cc828fcec32127ed939a.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Catalin Marinas (for arm64) Acked-by: David Hildenbrand Reviewed-by: Janosch Frank (for s390) Reviewed-by: Christoph Hellwig Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 59887c69d54c..607d742caa61 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; - struct vm_area_struct *vma; int ret; short *ptr; @@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, - FOLL_WRITE, &page, &vma, NULL); + FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. 
If get_user_pages_remote() fails, @@ -474,10 +473,9 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, - &old_page, &vma, NULL); - if (ret <= 0) - return ret; + old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR_OR_NULL(old_page)) + return old_page ? PTR_ERR(old_page) : 0; ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); if (result < 0) return result; -- cgit v1.2.3 From 6a25212dc35897e36f83860df3cefcdf5fb5c189 Mon Sep 17 00:00:00 2001 From: Prathu Baronia Date: Tue, 2 May 2023 14:32:40 +0530 Subject: kthread: fix spelling typo and grammar in comments - `If present` -> `If present,` - `reuturn` -> `return` - `function exit safely` -> `function to exit safely` Link: https://lkml.kernel.org/r/20230502090242.3037194-1-quic_pbaronia@quicinc.com Signed-off-by: Prathu Baronia Cc: Eric W. Biederman Cc: Jason A. Donenfeld Cc: Kees Cook Cc: Petr Mladek Cc: Sami Tolvanen Cc: Tejun Heo Cc: Zqiang Signed-off-by: Andrew Morton --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 490792b1066e..08455fb48ba6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -312,10 +312,10 @@ void __noreturn kthread_exit(long result) * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * - * If present complete @comp and the reuturn code to kthread_stop(). + * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of - * @comp can use this function exit safely. + * @comp can use this function to exit safely. * * Does not return. */ -- cgit v1.2.3 From 4e2f6342ccaf9d8beb33b398cc2862769c7d2a3b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 8 May 2023 06:44:58 +0000 Subject: fork: optimize memcg_charge_kernel_stack() a bit Since commit f1c1a9ee00e4 ("fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK"), memcg_charge_kernel_stack() has been moved into the CONFIG_VMAP_STACK block, so the CONFIG_VMAP_STACK check can be removed. Furthermore, memcg_charge_kernel_stack() is only invoked by alloc_thread_stack_node(), not by dup_task_struct(). If memcg_kmem_charge_page() fails, the uncharge process is handled in memcg_charge_kernel_stack() itself rather than in free_thread_stack(), so remove the incorrect comments. If memcg_charge_kernel_stack() fails to charge the pages used by the kernel stack, only the pages that were actually charged need to be uncharged. It's unnecessary to uncharge those pages whose memory cgroup pointer is NULL.
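The unwind idiom here, counting how many charges succeeded and rolling back only those, is generic; a standalone sketch distilling it (charge_stack_pages is a hypothetical name for illustration, while the memcg calls are the real ones used in the hunk below):

	static int charge_stack_pages(struct page **pages, int nr_pages)
	{
		int i, ret, nr_charged = 0;

		for (i = 0; i < nr_pages; i++) {
			ret = memcg_kmem_charge_page(pages[i], GFP_KERNEL, 0);
			if (ret)
				goto err;
			nr_charged++;
		}
		return 0;
	err:
		/* Roll back only the pages that were actually charged. */
		for (i = 0; i < nr_charged; i++)
			memcg_kmem_uncharge_page(pages[i], 0);
		return ret;
	}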
[akpm@linux-foundation.org: remove assertion that PAGE_SIZE is a multiple of 1k] Link: https://lkml.kernel.org/r/20230508064458.32855-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Cc: Andy Lutomirski Cc: Daniel Bristot de Oliveira Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/fork.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..d17995934eb4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -252,23 +252,19 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; + int nr_charged = 0; - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; + nr_charged++; } return 0; err: - /* - * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is - * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will - * ignore this page. - */ - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } -- cgit v1.2.3 From ff7138813ac4a20628ce4b40952c69faba835fbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:10:54 +0200 Subject: locking: add lockevent_read() prototype lockevent_read() has a __weak definition and the only caller in kernel/locking/lock_events.c, plus a strong definition in qspinlock_stat.h that overrides it, but no other declaration. This causes a W=1 warning: kernel/locking/lock_events.c:61:16: error: no previous prototype for 'lockevent_read' [-Werror=missing-prototypes] Add shared prototype to avoid the warnings. Link: https://lkml.kernel.org/r/20230517131102.934196-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Boqun Feng Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Paris Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Tejun Heo Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/locking/lock_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 8c7e7d25f09c..a6016b91803d 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_cond_inc(ev, c) #endif /* CONFIG_LOCK_EVENT_COUNTS */ + +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + #endif /* __LOCKING_LOCK_EVENTS_H */ -- cgit v1.2.3 From 525bb813a995283bef759659cfc00f3fc2d51e81 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:10:55 +0200 Subject: panic: hide unused global functions Building with W=1 shows warnings about two functions that have no declaration or caller in certain configurations: kernel/panic.c:688:6: error: no previous prototype for 'warn_slowpath_fmt' [-Werror=missing-prototypes] kernel/panic.c:710:6: error: no previous prototype for '__warn_printk' [-Werror=missing-prototypes] Enclose the definition in the same #ifdef check as the declaration. 
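In sketch form, the fix keeps declaration and definition under one guard so W=1 builds never see a prototype-less global definition (an abridged illustration using this patch's function, not the literal hunks):

	/* header: the prototype only exists when the feature is configured */
	#ifdef CONFIG_BUG
	void __warn_printk(const char *fmt, ...);
	#endif

	/* panic.c: keep the definition under the same guard */
	#ifdef CONFIG_BUG
	void __warn_printk(const char *fmt, ...)
	{
		/* ... */
	}
	#endif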
Link: https://lkml.kernel.org/r/20230517131102.934196-8-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Boqun Feng Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Paris Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Tejun Heo Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/panic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 886d2ebd0a0d..10effe40a3fa 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -684,6 +684,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef CONFIG_BUG #ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) @@ -722,8 +723,6 @@ void __warn_printk(const char *fmt, ...) EXPORT_SYMBOL(__warn_printk); #endif -#ifdef CONFIG_BUG - /* Support resetting WARN*_ONCE state */ static int clear_warn_once_set(void *data, u64 val) -- cgit v1.2.3 From e0ddec73fd4822d2ffe914d5ce3e2718f985276a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:49:25 +0200 Subject: kcov: add prototypes for helper functions A number of internal functions in kcov are only called from generated code and don't technically need a declaration, but 'make W=1' warns about global symbols without a prototype: kernel/kcov.c:199:14: error: no previous prototype for '__sanitizer_cov_trace_pc' [-Werror=missing-prototypes] kernel/kcov.c:264:14: error: no previous prototype for '__sanitizer_cov_trace_cmp1' [-Werror=missing-prototypes] kernel/kcov.c:270:14: error: no previous prototype for '__sanitizer_cov_trace_cmp2' [-Werror=missing-prototypes] kernel/kcov.c:276:14: error: no previous prototype for '__sanitizer_cov_trace_cmp4' [-Werror=missing-prototypes] kernel/kcov.c:282:14: error: no previous prototype for '__sanitizer_cov_trace_cmp8' [-Werror=missing-prototypes] kernel/kcov.c:288:14: error: no previous prototype for '__sanitizer_cov_trace_const_cmp1' [-Werror=missing-prototypes] kernel/kcov.c:295:14: error: no previous prototype for '__sanitizer_cov_trace_const_cmp2' [-Werror=missing-prototypes] kernel/kcov.c:302:14: error: no previous prototype for '__sanitizer_cov_trace_const_cmp4' [-Werror=missing-prototypes] kernel/kcov.c:309:14: error: no previous prototype for '__sanitizer_cov_trace_const_cmp8' [-Werror=missing-prototypes] kernel/kcov.c:316:14: error: no previous prototype for '__sanitizer_cov_trace_switch' [-Werror=missing-prototypes] Adding prototypes for these in a header solves that problem, but now there is a mismatch between the built-in type and the prototype on 64-bit architectures because they expect some functions to take a 64-bit 'unsigned long' argument rather than an 'unsigned long long' u64 type: include/linux/kcov.h:84:6: error: conflicting types for built-in function '__sanitizer_cov_trace_switch'; expected 'void(long long unsigned int, void *)' [-Werror=builtin-declaration-mismatch] 84 | void __sanitizer_cov_trace_switch(u64 val, u64 *cases); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ Avoid this as well with a custom type definition. 
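The custom type is presumably along these lines (a sketch of the idea only; the actual definition lands in include/linux/kcov.h, which is outside this 'kernel'-limited excerpt):

	/*
	 * On 64-bit architectures the compiler's built-in prototypes use
	 * 'unsigned long' for the 64-bit arguments, so the kernel-side type
	 * must match that; 32-bit keeps a genuine 64-bit type.
	 */
	#ifdef CONFIG_64BIT
	typedef unsigned long kcov_u64;
	#else
	typedef unsigned long long kcov_u64;
	#endif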
Link: https://lkml.kernel.org/r/20230517124944.929997-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Rong Tao Signed-off-by: Andrew Morton --- kernel/kcov.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 84c717337df0..f9ac2e9e460f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -279,7 +279,7 @@ void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); -void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2) { write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); } @@ -306,16 +306,17 @@ void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); -void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2) { write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); -void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg) { u64 i; + u64 *cases = arg; u64 count = cases[0]; u64 size = cases[1]; u64 type = KCOV_CMP_CONST; -- cgit v1.2.3 From 4379e59fe5665cfda737e45b8bf2f05321ef049c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:26 -0700 Subject: watchdog/perf: more properly prevent false positives with turbo modes Currently, in the watchdog_overflow_callback() we first check to see if the watchdog had been touched and _then_ we handle the workaround for turbo mode. This order should be reversed. Specifically, "touching" the hardlockup detector's watchdog should avoid lockups being detected for one period that should be roughly the same regardless of whether we're running turbo or not. That means that we should do the extra accounting for turbo _before_ we look at (and clear) the global indicating that we've been touched. NOTE: this fix is made based on code inspection. I am not aware of any reports where the old code would have generated false positives. That being said, this order seems more correct and also makes it easier down the line to share code with the "buddy" hardlockup detector. Link: https://lkml.kernel.org/r/20230519101840.v5.2.I843b0d1de3e096ba111a179f3adb16d576bef5c7@changeid Fixes: 7edaeb6841df ("kernel/watchdog: Prevent false positives with turbo modes") Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. 
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog_hld.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 247bf0b1582c..1e8a49dc956e 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -114,14 +114,14 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (!watchdog_check_timestamp()) + return; + if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; } - if (!watchdog_check_timestamp()) - return; - /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have -- cgit v1.2.3 From 810b560e8985725dbd57bbb3f188c231365eb5ae Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Fri, 19 May 2023 10:18:27 -0700 Subject: watchdog: remove WATCHDOG_DEFAULT No reference to WATCHDOG_DEFAULT, remove it. Link: https://lkml.kernel.org/r/20230519101840.v5.3.I6a729209a1320e0ad212176e250ff945b8f91b2a@changeid Signed-off-by: Pingfan Liu Signed-off-by: Lecopzer Chen Signed-off-by: Douglas Anderson Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8e61f21e7e33..582d572e1379 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -30,10 +30,8 @@ static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) # define NMI_WATCHDOG_DEFAULT 1 #else -# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) # define NMI_WATCHDOG_DEFAULT 0 #endif -- cgit v1.2.3 From 730211182ed083898fa5feb4b28459ffac4c9615 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Fri, 19 May 2023 10:18:28 -0700 Subject: watchdog/hardlockup: change watchdog_nmi_enable() to void Nobody cares about the return value of watchdog_nmi_enable(), changing its prototype to void. Link: https://lkml.kernel.org/r/20230519101840.v5.4.Ic3a19b592eb1ac4c6f6eade44ffd943e8637b6e5@changeid Signed-off-by: Pingfan Liu Signed-off-by: Lecopzer Chen Signed-off-by: Douglas Anderson Reviewed-by: Petr Mladek Acked-by: Nicholas Piggin Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Randy Dunlap Cc: "Ravi V. 
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 582d572e1379..c705a18b26bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -93,10 +93,9 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); * softlockup watchdog start and stop. The arch must select the * SOFTLOCKUP_DETECTOR Kconfig. */ -int __weak watchdog_nmi_enable(unsigned int cpu) +void __weak watchdog_nmi_enable(unsigned int cpu) { hardlockup_detector_perf_enable(); - return 0; } void __weak watchdog_nmi_disable(unsigned int cpu) -- cgit v1.2.3 From 1fafaa7745eeeeffef7155ab5eeb8cc83d04874f Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Fri, 19 May 2023 10:18:29 -0700 Subject: watchdog/perf: ensure CPU-bound context when creating hardlockup detector event hardlockup_detector_event_create() should create perf_event on the current CPU. Preemption could not get disabled because perf_event_create_kernel_counter() allocates memory. Instead, the CPU locality is achieved by processing the code in a per-CPU bound kthread. Add a check to prevent mistakes when calling the code in another code path. Link: https://lkml.kernel.org/r/20230519101840.v5.5.I654063e53782b11d53e736a8ad4897ffd207406a@changeid Signed-off-by: Pingfan Liu Co-developed-by: Lecopzer Chen Signed-off-by: Lecopzer Chen Signed-off-by: Douglas Anderson Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog_hld.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 1e8a49dc956e..2125b09e09d7 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -165,10 +165,16 @@ static void watchdog_overflow_callback(struct perf_event *event, static int hardlockup_detector_event_create(void) { - unsigned int cpu = smp_processor_id(); + unsigned int cpu; struct perf_event_attr *wd_attr; struct perf_event *evt; + /* + * Preemption is not disabled because memory will be allocated. + * Ensure CPU-locality by calling this in per-CPU kthread. + */ + WARN_ON(!is_percpu_thread()); + cpu = raw_smp_processor_id(); wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); -- cgit v1.2.3 From 6ea0d04211a7715a926f5dca4aa1065614fa64da Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:31 -0700 Subject: watchdog/perf: rename watchdog_hld.c to watchdog_perf.c The code currently in "watchdog_hld.c" is for detecting hardlockups using perf, as evidenced by the line in the Makefile that only compiles this file if CONFIG_HARDLOCKUP_DETECTOR_PERF is defined. Rename the file to prepare for the buddy hardlockup detector, which doesn't use perf. It could be argued that the new name makes it less obvious that this is a hardlockup detector. 
While true, it's not hard to remember that the "perf" detector is always a hardlockup detector and it's nice not to have names that are too convoluted. Link: https://lkml.kernel.org/r/20230519101840.v5.7.Ice803cb078d0e15fb2cbf49132f096ee2bd4199d@changeid Signed-off-by: Douglas Anderson Acked-by: Nicholas Piggin Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/Makefile | 2 +- kernel/watchdog_hld.c | 302 ------------------------------------------------- kernel/watchdog_perf.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 303 insertions(+), 303 deletions(-) delete mode 100644 kernel/watchdog_hld.c create mode 100644 kernel/watchdog_perf.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index b69c95315480..7eb72033143c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,7 +91,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c deleted file mode 100644 index 2125b09e09d7..000000000000 --- a/kernel/watchdog_hld.c +++ /dev/null @@ -1,302 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Detect hard lockups on a system - * - * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. - * - * Note: Most of this code is borrowed heavily from the original softlockup - * detector, so thanks to Ingo for the initial implementation. - * Some chunks also taken from the old x86-specific nmi watchdog code, thanks - * to those contributors as well. - */ - -#define pr_fmt(fmt) "NMI watchdog: " fmt - -#include -#include -#include -#include - -#include -#include - -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; - -static unsigned long hardlockup_allcpu_dumped; -static atomic_t watchdog_cpus = ATOMIC_INIT(0); - -notrace void arch_touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); -} -EXPORT_SYMBOL(arch_touch_nmi_watchdog); - -#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP -static DEFINE_PER_CPU(ktime_t, last_timestamp); -static DEFINE_PER_CPU(unsigned int, nmi_rearmed); -static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; - -void watchdog_update_hrtimer_threshold(u64 period) -{ - /* - * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 - * - * So it runs effectively with 2.5 times the rate of the NMI - * watchdog. 
That means the hrtimer should fire 2-3 times before - * the NMI watchdog expires. The NMI watchdog on x86 is based on - * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles - * might run way faster than expected and the NMI fires in a - * smaller period than the one deduced from the nominal CPU - * frequency. Depending on the Turbo-Mode factor this might be fast - * enough to get the NMI period smaller than the hrtimer watchdog - * period and trigger false positives. - * - * The sample threshold is used to check in the NMI handler whether - * the minimum time between two NMI samples has elapsed. That - * prevents false positives. - * - * Set this to 4/5 of the actual watchdog threshold period so the - * hrtimer is guaranteed to fire at least once within the real - * watchdog threshold. - */ - watchdog_hrtimer_sample_threshold = period * 2; -} - -static bool watchdog_check_timestamp(void) -{ - ktime_t delta, now = ktime_get_mono_fast_ns(); - - delta = now - __this_cpu_read(last_timestamp); - if (delta < watchdog_hrtimer_sample_threshold) { - /* - * If ktime is jiffies based, a stalled timer would prevent - * jiffies from being incremented and the filter would look - * at a stale timestamp and never trigger. - */ - if (__this_cpu_inc_return(nmi_rearmed) < 10) - return false; - } - __this_cpu_write(nmi_rearmed, 0); - __this_cpu_write(last_timestamp, now); - return true; -} -#else -static inline bool watchdog_check_timestamp(void) -{ - return true; -} -#endif - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (!watchdog_check_timestamp()) - return; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", - this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} - -static int hardlockup_detector_event_create(void) -{ - unsigned int cpu; - struct perf_event_attr *wd_attr; - struct perf_event *evt; - - /* - * Preemption is not disabled because memory will be allocated. - * Ensure CPU-locality by calling this in per-CPU kthread. 
- */ - WARN_ON(!is_percpu_thread()); - cpu = raw_smp_processor_id(); - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, - watchdog_overflow_callback, NULL); - if (IS_ERR(evt)) { - pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); - return PTR_ERR(evt); - } - this_cpu_write(watchdog_ev, evt); - return 0; -} - -/** - * hardlockup_detector_perf_enable - Enable the local event - */ -void hardlockup_detector_perf_enable(void) -{ - if (hardlockup_detector_event_create()) - return; - - /* use original value for check */ - if (!atomic_fetch_inc(&watchdog_cpus)) - pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); - - perf_event_enable(this_cpu_read(watchdog_ev)); -} - -/** - * hardlockup_detector_perf_disable - Disable the local event - */ -void hardlockup_detector_perf_disable(void) -{ - struct perf_event *event = this_cpu_read(watchdog_ev); - - if (event) { - perf_event_disable(event); - this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - atomic_dec(&watchdog_cpus); - } -} - -/** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. - */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - -/** - * hardlockup_detector_perf_stop - Globally stop watchdog events - * - * Special interface for x86 to handle the perf HT bug. - */ -void __init hardlockup_detector_perf_stop(void) -{ - int cpu; - - lockdep_assert_cpus_held(); - - for_each_online_cpu(cpu) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) - perf_event_disable(event); - } -} - -/** - * hardlockup_detector_perf_restart - Globally restart watchdog events - * - * Special interface for x86 to handle the perf HT bug. - */ -void __init hardlockup_detector_perf_restart(void) -{ - int cpu; - - lockdep_assert_cpus_held(); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - return; - - for_each_online_cpu(cpu) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) - perf_event_enable(event); - } -} - -/** - * hardlockup_detector_perf_init - Probe whether NMI event is available at all - */ -int __init hardlockup_detector_perf_init(void) -{ - int ret = hardlockup_detector_event_create(); - - if (ret) { - pr_info("Perf NMI watchdog permanently disabled\n"); - } else { - perf_event_release_kernel(this_cpu_read(watchdog_ev)); - this_cpu_write(watchdog_ev, NULL); - } - return ret; -} diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c new file mode 100644 index 000000000000..8b8015758ea5 --- /dev/null +++ b/kernel/watchdog_perf.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Detect hard lockups on a system using perf + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. 
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. + */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include +#include +#include +#include + +#include +#include + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); +static struct cpumask dead_events_mask; + +static unsigned long hardlockup_allcpu_dumped; +static atomic_t watchdog_cpus = ATOMIC_INIT(0); + +notrace void arch_touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. 
+ */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (!watchdog_check_timestamp()) + return; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", + this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +static int hardlockup_detector_event_create(void) +{ + unsigned int cpu; + struct perf_event_attr *wd_attr; + struct perf_event *evt; + + /* + * Preemption is not disabled because memory will be allocated. + * Ensure CPU-locality by calling this in per-CPU kthread. + */ + WARN_ON(!is_percpu_thread()); + cpu = raw_smp_processor_id(); + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} + +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; + + /* use original value for check */ + if (!atomic_fetch_inc(&watchdog_cpus)) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + + perf_event_enable(this_cpu_read(watchdog_ev)); +} + +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) +{ + struct perf_event *event = this_cpu_read(watchdog_ev); + + if (event) { + perf_event_disable(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); + atomic_dec(&watchdog_cpus); + } +} + +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). 
Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(dead_event, cpu); + + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); + per_cpu(dead_event, cpu) = NULL; + } + cpumask_clear(&dead_events_mask); +} + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} + +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); + + if (ret) { + pr_info("Perf NMI watchdog permanently disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); + } + return ret; +} -- cgit v1.2.3 From 81972551df9d168a8183b786ff4de06008469c2e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:32 -0700 Subject: watchdog/hardlockup: move perf hardlockup checking/panic to common watchdog.c The perf hardlockup detector works by looking at interrupt counts and seeing if they change from run to run. The interrupt counts are managed by the common watchdog code via its watchdog_timer_fn(). Currently the API between the perf detector and the common code is a function: is_hardlockup(). When the hard lockup detector sees that function return true then it handles printing out debug info and inducing a panic if necessary. Let's change the API a little bit in preparation for the buddy hardlockup detector. The buddy hardlockup detector wants to print nearly the same debug info and have nearly the same panic behavior. That means we want to move all that code to the common file. For now, the code in the common file will only be there if the perf hardlockup detector is enabled, but eventually it will be selected by a common config. Right now, this _just_ moves the code from the perf detector file to the common file and changes the names. It doesn't make the changes that the buddy hardlockup detector will need and doesn't do any style cleanups. A future patch will do cleanup to make it more obvious what changed. With the above, we no longer have any callers of is_hardlockup() outside of the "watchdog.c" file, so we can remove it from the header, make it static, and move it to the same "#ifdef" block as our new watchdog_hardlockup_check(). While doing this, it can be noted that even if no hardlockup detectors were configured the existing code used to still have the code for counting/checking "hrtimer_interrupts" even if the perf hardlockup detector wasn't configured. 
We didn't need to do that, so move all the "hrtimer_interrupts" counting to only be there if the perf hardlockup detector is configured as well. This change is expected to be a no-op. Link: https://lkml.kernel.org/r/20230519101840.v5.8.Id4133d3183e798122dc3b6205e7852601f289071@changeid Signed-off-by: Douglas Anderson Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 93 +++++++++++++++++++++++++++++++++++++++----------- kernel/watchdog_perf.c | 42 +---------------------- 2 files changed, 74 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c705a18b26bf..12ce37d76e7d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -85,6 +85,78 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) + +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static unsigned long hardlockup_allcpu_dumped; + +static bool is_hardlockup(void) +{ + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); + + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; +} + +static void watchdog_hardlockup_kick(void) +{ + __this_cpu_inc(hrtimer_interrupts); +} + +void watchdog_hardlockup_check(struct pt_regs *regs) +{ + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", + this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +#else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */ + +static inline void watchdog_hardlockup_kick(void) { } + +#endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */ + /* * These functions can be overridden if an architecture implements its * own hardlockup detector. 
@@ -176,8 +248,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; static int __init nowatchdog_setup(char *str) @@ -312,22 +382,6 @@ static int is_softlockup(unsigned long touch_ts, } /* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - -static void watchdog_interrupt_count(void) -{ - __this_cpu_inc(hrtimer_interrupts); -} - static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); @@ -358,8 +412,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; - /* kick the hardlockup detector */ - watchdog_interrupt_count(); + watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 8b8015758ea5..04415812d079 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -20,13 +20,11 @@ #include #include -static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; -static unsigned long hardlockup_allcpu_dumped; static atomic_t watchdog_cpus = ATOMIC_INIT(0); notrace void arch_touch_nmi_watchdog(void) @@ -122,45 +120,7 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", - this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; + watchdog_hardlockup_check(regs); } static int hardlockup_detector_event_create(void) -- cgit v1.2.3 From 1610611aadc2241179e7090ba2f3fd0d763d9932 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:33 -0700 Subject: watchdog/hardlockup: style changes to watchdog_hardlockup_check() / is_hardlockup() These are tiny style changes: - Add a blank line before a "return". - Renames two globals to use the "watchdog_hardlockup" prefix. 
- Store processor id in "unsigned int" rather than "int". - Minor comment rewording. - Use "else" rather than extra returns since it seemed more symmetric. Link: https://lkml.kernel.org/r/20230519101840.v5.9.I818492c326b632560b09f20d2608455ecf9d3650@changeid Reviewed-by: Petr Mladek Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 12ce37d76e7d..169e5dffbc00 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -89,8 +89,8 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static unsigned long hardlockup_allcpu_dumped; +static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); +static unsigned long watchdog_hardlockup_all_cpu_dumped; static bool is_hardlockup(void) { @@ -100,6 +100,7 @@ static bool is_hardlockup(void) return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } @@ -110,21 +111,20 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(struct pt_regs *regs) { - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't + /* + * Check for a hardlockup by making sure the CPU's timer + * interrupt is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ if (is_hardlockup()) { - int this_cpu = smp_processor_id(); + unsigned int this_cpu = smp_processor_id(); - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) + /* Only print hardlockups once. 
*/ + if (__this_cpu_read(watchdog_hardlockup_warned)) return; - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", - this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); print_irqtrace_events(current); if (regs) @@ -137,18 +137,16 @@ void watchdog_hardlockup_check(struct pt_regs *regs) * generating interleaving traces */ if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); - __this_cpu_write(hard_watchdog_warn, true); - return; + __this_cpu_write(watchdog_hardlockup_warned, true); + } else { + __this_cpu_write(watchdog_hardlockup_warned, false); } - - __this_cpu_write(hard_watchdog_warn, false); - return; } #else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */ -- cgit v1.2.3 From 77c12fc95980d100fdc49e88a5727c242d0dfedc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:34 -0700 Subject: watchdog/hardlockup: add a "cpu" param to watchdog_hardlockup_check() In preparation for the buddy hardlockup detector where the CPU checking for lockup might not be the currently running CPU, add a "cpu" parameter to watchdog_hardlockup_check(). As part of this change, make hrtimer_interrupts an atomic_t since now the CPU incrementing the value and the CPU reading the value might be different. Technically this could also be done with just READ_ONCE and WRITE_ONCE, but atomic_t feels a little cleaner in this case. While hrtimer_interrupts is made atomic_t, we change hrtimer_interrupts_saved from "unsigned long" to "int". The "int" is needed to match the data type backing atomic_t for hrtimer_interrupts. Even if this changes us from 64-bits to 32-bits (which I don't think is true for most compilers), it doesn't really matter. All we ever do is increment it every few seconds and compare it to an old value so 32-bits is fine (even 16-bits would be). The "signed" vs "unsigned" also doesn't matter for simple equality comparisons. hrtimer_interrupts_saved is _not_ switched to atomic_t nor even accessed with READ_ONCE / WRITE_ONCE. The hrtimer_interrupts_saved is always consistently accessed by the same CPU. NOTE: with the upcoming "buddy" detector there is one special case. When a CPU goes offline/online then we can change which CPU is the one to consistently access a given instance of hrtimer_interrupts_saved. We still can't end up with a partially updated hrtimer_interrupts_saved, however, because we end up petting all affected CPUs to make sure the new and old CPU can't somehow end up reading/writing hrtimer_interrupts_saved at the same time. Link: https://lkml.kernel.org/r/20230519101840.v5.10.I3a7d4dd8c23ac30ee0b607d77feb6646b64825c0@changeid Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V.
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 52 +++++++++++++++++++++++++++++++------------------- kernel/watchdog_perf.c | 2 +- 2 files changed, 33 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 169e5dffbc00..2552e224f76a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -87,29 +87,34 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); +static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static unsigned long watchdog_hardlockup_all_cpu_dumped; -static bool is_hardlockup(void) +static bool is_hardlockup(unsigned int cpu) { - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); + int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) return true; - __this_cpu_write(hrtimer_interrupts_saved, hrint); + /* + * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE + * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is + * written/read by a single CPU. + */ + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; return false; } static void watchdog_hardlockup_kick(void) { - __this_cpu_inc(hrtimer_interrupts); + atomic_inc(raw_cpu_ptr(&hrtimer_interrupts)); } -void watchdog_hardlockup_check(struct pt_regs *regs) +void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { /* * Check for a hardlockup by making sure the CPU's timer @@ -117,35 +122,42 @@ void watchdog_hardlockup_check(struct pt_regs *regs) * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ - if (is_hardlockup()) { + if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); + struct cpumask backtrace_mask = *cpu_online_mask; /* Only print hardlockups once. 
*/ - if (__this_cpu_read(watchdog_hardlockup_warned)) + if (per_cpu(watchdog_hardlockup_warned, cpu)) return; - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); print_modules(); print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); + if (cpu == this_cpu) { + if (regs) + show_regs(regs); + else + dump_stack(); + cpumask_clear_cpu(cpu, &backtrace_mask); + } else { + if (trigger_single_cpu_backtrace(cpu)) + cpumask_clear_cpu(cpu, &backtrace_mask); + } /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces + * Perform multi-CPU dump only once to avoid multiple + * hardlockups generating interleaving traces */ if (sysctl_hardlockup_all_cpu_backtrace && !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) - trigger_allbutself_cpu_backtrace(); + trigger_cpumask_backtrace(&backtrace_mask); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); - __this_cpu_write(watchdog_hardlockup_warned, true); + per_cpu(watchdog_hardlockup_warned, cpu) = true; } else { - __this_cpu_write(watchdog_hardlockup_warned, false); + per_cpu(watchdog_hardlockup_warned, cpu) = false; } } diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 04415812d079..4e60e8023515 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -120,7 +120,7 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } - watchdog_hardlockup_check(regs); + watchdog_hardlockup_check(smp_processor_id(), regs); } static int hardlockup_detector_event_create(void) -- cgit v1.2.3 From ed92e1ef52224c7c9c15fba559448396b059c2ee Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:35 -0700 Subject: watchdog/hardlockup: move perf hardlockup watchdog petting to watchdog.c In preparation for the buddy hardlockup detector, which wants the same petting logic as the current perf hardlockup detector, move the code to watchdog.c. While doing this, rename the global variable to match others nearby. As part of this change we have to change the code to account for the fact that the CPU we're running on might be different than the one we're checking. Currently the code in watchdog.c is guarded by CONFIG_HARDLOCKUP_DETECTOR_PERF, which makes this change seem silly. However, a future patch will change this. Link: https://lkml.kernel.org/r/20230519101840.v5.11.I00dfd6386ee00da25bf26d140559a41339b53e57@changeid Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. 
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 19 +++++++++++++++++++ kernel/watchdog_perf.c | 19 ------------------- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2552e224f76a..64d7d2a0a7df 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -90,8 +90,22 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); +static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); static unsigned long watchdog_hardlockup_all_cpu_dumped; +notrace void arch_touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_hardlockup_touched, true); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + static bool is_hardlockup(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); @@ -116,6 +130,11 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + if (per_cpu(watchdog_hardlockup_touched, cpu)) { + per_cpu(watchdog_hardlockup_touched, cpu) = false; + return; + } + /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 4e60e8023515..547917ebd5d3 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -20,26 +20,12 @@ #include #include -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); -notrace void arch_touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. 
- */ - raw_cpu_write(watchdog_nmi_touch, true); -} -EXPORT_SYMBOL(arch_touch_nmi_watchdog); - #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP static DEFINE_PER_CPU(ktime_t, last_timestamp); static DEFINE_PER_CPU(unsigned int, nmi_rearmed); @@ -115,11 +101,6 @@ static void watchdog_overflow_callback(struct perf_event *event, if (!watchdog_check_timestamp()) return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - watchdog_hardlockup_check(smp_processor_id(), regs); } -- cgit v1.2.3 From df95d3085caa5b99a60eb033d7ad6c2ff2b43dbf Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:36 -0700 Subject: watchdog/hardlockup: rename some "NMI watchdog" constants/function Do a search and replace of: - NMI_WATCHDOG_ENABLED => WATCHDOG_HARDLOCKUP_ENABLED - SOFT_WATCHDOG_ENABLED => WATCHDOG_SOFTOCKUP_ENABLED - watchdog_nmi_ => watchdog_hardlockup_ - nmi_watchdog_available => watchdog_hardlockup_available - nmi_watchdog_user_enabled => watchdog_hardlockup_user_enabled - soft_watchdog_user_enabled => watchdog_softlockup_user_enabled - NMI_WATCHDOG_DEFAULT => WATCHDOG_HARDLOCKUP_DEFAULT Then update a few comments near where names were changed. This is specifically to make it less confusing when we want to introduce the buddy hardlockup detector, which isn't using NMIs. As part of this, we sanitized a few names for consistency. [trix@redhat.com: make variables static] Link: https://lkml.kernel.org/r/20230525162822.1.I0fb41d138d158c9230573eaa37dc56afa2fb14ee@changeid Link: https://lkml.kernel.org/r/20230519101840.v5.12.I91f7277bab4bf8c0cb238732ed92e7ce7bbd71a6@changeid Signed-off-by: Douglas Anderson Signed-off-by: Tom Rix Reviewed-by: Tom Rix Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. 
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 113 +++++++++++++++++++++++++------------------------ kernel/watchdog_perf.c | 2 +- 2 files changed, 58 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 64d7d2a0a7df..c6790c7dc08d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -30,17 +30,17 @@ static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -# define NMI_WATCHDOG_DEFAULT 1 +# define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else -# define NMI_WATCHDOG_DEFAULT 0 +# define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; -int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; -int __read_mostly soft_watchdog_user_enabled = 1; +static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; +static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly nmi_watchdog_available; +static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -66,7 +66,7 @@ unsigned int __read_mostly hardlockup_panic = */ void __init hardlockup_detector_disable(void) { - nmi_watchdog_user_enabled = 0; + watchdog_hardlockup_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -76,9 +76,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - nmi_watchdog_user_enabled = 0; + watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) - nmi_watchdog_user_enabled = 1; + watchdog_hardlockup_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -190,40 +190,40 @@ static inline void watchdog_hardlockup_kick(void) { } * These functions can be overridden if an architecture implements its * own hardlockup detector. * - * watchdog_nmi_enable/disable can be implemented to start and stop when + * watchdog_hardlockup_enable/disable can be implemented to start and stop when * softlockup watchdog start and stop. The arch must select the * SOFTLOCKUP_DETECTOR Kconfig. */ -void __weak watchdog_nmi_enable(unsigned int cpu) +void __weak watchdog_hardlockup_enable(unsigned int cpu) { hardlockup_detector_perf_enable(); } -void __weak watchdog_nmi_disable(unsigned int cpu) +void __weak watchdog_hardlockup_disable(unsigned int cpu) { hardlockup_detector_perf_disable(); } -/* Return 0, if a NMI watchdog is available. Error code otherwise */ -int __weak __init watchdog_nmi_probe(void) +/* Return 0, if a hardlockup watchdog is available. 
Error code otherwise */ +int __weak __init watchdog_hardlockup_probe(void) { return hardlockup_detector_perf_init(); } /** - * watchdog_nmi_stop - Stop the watchdog for reconfiguration + * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration * * The reconfiguration steps are: - * watchdog_nmi_stop(); + * watchdog_hardlockup_stop(); * update_variables(); - * watchdog_nmi_start(); + * watchdog_hardlockup_start(); */ -void __weak watchdog_nmi_stop(void) { } +void __weak watchdog_hardlockup_stop(void) { } /** - * watchdog_nmi_start - Start the watchdog after reconfiguration + * watchdog_hardlockup_start - Start the watchdog after reconfiguration * - * Counterpart to watchdog_nmi_stop(). + * Counterpart to watchdog_hardlockup_stop(). * * The following variables have been updated in update_variables() and * contain the currently valid configuration: @@ -231,23 +231,23 @@ void __weak watchdog_nmi_stop(void) { } * - watchdog_thresh * - watchdog_cpumask */ -void __weak watchdog_nmi_start(void) { } +void __weak watchdog_hardlockup_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit * - * Caller needs to make sure that the NMI/perf watchdogs are off, so this - * can't race with watchdog_nmi_disable(). + * Caller needs to make sure that the hard watchdogs are off, so this + * can't race with watchdog_hardlockup_disable(). */ static void lockup_detector_update_enable(void) { watchdog_enabled = 0; if (!watchdog_user_enabled) return; - if (nmi_watchdog_available && nmi_watchdog_user_enabled) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - if (soft_watchdog_user_enabled) - watchdog_enabled |= SOFT_WATCHDOG_ENABLED; + if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled) + watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED; + if (watchdog_softlockup_user_enabled) + watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED; } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -288,7 +288,7 @@ __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - soft_watchdog_user_enabled = 0; + watchdog_softlockup_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -402,7 +402,7 @@ static int is_softlockup(unsigned long touch_ts, unsigned long period_ts, unsigned long now) { - if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ + if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; @@ -537,7 +537,7 @@ static void watchdog_enable(unsigned int cpu) complete(done); /* - * Start the timer first to prevent the NMI watchdog triggering + * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); @@ -547,9 +547,9 @@ static void watchdog_enable(unsigned int cpu) /* Initialize timestamp */ update_touch_ts(); - /* Enable the perf event */ - if (watchdog_enabled & NMI_WATCHDOG_ENABLED) - watchdog_nmi_enable(cpu); + /* Enable the hardlockup detector */ + if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED) + watchdog_hardlockup_enable(cpu); } static void watchdog_disable(unsigned int cpu) @@ -559,11 +559,11 @@ static void watchdog_disable(unsigned int cpu) WARN_ON_ONCE(cpu != smp_processor_id()); /* - * Disable the perf event first. 
That prevents that a large delay - * between disabling the timer and disabling the perf event causes - * the perf NMI to detect a false positive. + * Disable the hardlockup detector first. That prevents that a large + * delay between disabling the timer and disabling the hardlockup + * detector causes a false positive. */ - watchdog_nmi_disable(cpu); + watchdog_hardlockup_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); } @@ -619,7 +619,7 @@ int lockup_detector_offline_cpu(unsigned int cpu) static void __lockup_detector_reconfigure(void) { cpus_read_lock(); - watchdog_nmi_stop(); + watchdog_hardlockup_stop(); softlockup_stop_all(); set_sample_period(); @@ -627,7 +627,7 @@ static void __lockup_detector_reconfigure(void) if (watchdog_enabled && watchdog_thresh) softlockup_start_all(); - watchdog_nmi_start(); + watchdog_hardlockup_start(); cpus_read_unlock(); /* * Must be called outside the cpus locked section to prevent @@ -668,9 +668,9 @@ static __init void lockup_detector_setup(void) static void __lockup_detector_reconfigure(void) { cpus_read_lock(); - watchdog_nmi_stop(); + watchdog_hardlockup_stop(); lockup_detector_update_enable(); - watchdog_nmi_start(); + watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) @@ -725,14 +725,14 @@ static void proc_watchdog_update(void) /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' - * -------------------|----------------------------|-------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | - * | | SOFT_WATCHDOG_ENABLED - * -------------------|----------------------------|-------------------------- - * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED - * -------------------|----------------------------|-------------------------- - * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------------|------------------------------- + * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED | + * | | WATCHDOG_SOFTOCKUP_ENABLED + * -------------------|----------------------------------|------------------------------- + * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED + * -------------------|----------------------------------|------------------------------- + * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -764,7 +764,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int proc_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | + WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ -774,9 +775,9 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - if (!nmi_watchdog_available && write) + if (!watchdog_hardlockup_available && write) return -ENOTSUPP; - return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ -786,7 
+787,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, int proc_soft_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ -854,7 +855,7 @@ static struct ctl_table watchdog_sysctls[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, + .data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, @@ -871,7 +872,7 @@ static struct ctl_table watchdog_sysctls[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, + .data = &watchdog_softlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, @@ -940,8 +941,8 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, housekeeping_cpumask(HK_TYPE_TIMER)); - if (!watchdog_nmi_probe()) - nmi_watchdog_available = true; + if (!watchdog_hardlockup_probe()) + watchdog_hardlockup_available = true; lockup_detector_setup(); watchdog_sysctl_init(); } diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 547917ebd5d3..9e6042a892b3 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -215,7 +215,7 @@ void __init hardlockup_detector_perf_restart(void) lockdep_assert_cpus_held(); - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) return; for_each_online_cpu(cpu) { -- cgit v1.2.3 From d9b3629ade8ebffb0075e311409796a56bac8282 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:37 -0700 Subject: watchdog/hardlockup: have the perf hardlockup use __weak functions more cleanly The fact that watchdog_hardlockup_enable(), watchdog_hardlockup_disable(), and watchdog_hardlockup_probe() are declared __weak means that the configured hardlockup detector can define non-weak versions of those functions if it needs to. Instead of doing this, the perf hardlockup detector hooked itself into the default __weak implementation, which was a bit awkward. Clean this up. From comments, it looks as if the original design was done because the __weak functions were expected to be implemented by the architecture and not by the configured hardlockup detector. This got awkward when we tried to add the buddy lockup detector which was not arch-specific but wanted to hook into those same functions. This is not expected to have any functional impact. Link: https://lkml.kernel.org/r/20230519101840.v5.13.I847d9ec852449350997ba00401d2462a9cb4302b@changeid Signed-off-by: Douglas Anderson Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V.
Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 30 ++++++++++++++++++------------ kernel/watchdog_perf.c | 20 ++++++++++++++------ 2 files changed, 32 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c6790c7dc08d..e67125f64719 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -187,27 +187,33 @@ static inline void watchdog_hardlockup_kick(void) { } #endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */ /* - * These functions can be overridden if an architecture implements its - * own hardlockup detector. + * These functions can be overridden based on the configured hardlockdup detector. * * watchdog_hardlockup_enable/disable can be implemented to start and stop when - * softlockup watchdog start and stop. The arch must select the + * softlockup watchdog start and stop. The detector must select the * SOFTLOCKUP_DETECTOR Kconfig. */ -void __weak watchdog_hardlockup_enable(unsigned int cpu) -{ - hardlockup_detector_perf_enable(); -} +void __weak watchdog_hardlockup_enable(unsigned int cpu) { } -void __weak watchdog_hardlockup_disable(unsigned int cpu) -{ - hardlockup_detector_perf_disable(); -} +void __weak watchdog_hardlockup_disable(unsigned int cpu) { } /* Return 0, if a hardlockup watchdog is available. Error code otherwise */ int __weak __init watchdog_hardlockup_probe(void) { - return hardlockup_detector_perf_init(); + /* + * If CONFIG_HAVE_NMI_WATCHDOG is defined then an architecture + * is assumed to have the hard watchdog available and we return 0. + */ + if (IS_ENABLED(CONFIG_HAVE_NMI_WATCHDOG)) + return 0; + + /* + * Hardlockup detectors other than those using CONFIG_HAVE_NMI_WATCHDOG + * are required to implement a non-weak version of this probe function + * to tell whether they are available. If they don't override then + * we'll return -ENODEV. + */ + return -ENODEV; } /** diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 9e6042a892b3..349fcd4d2abc 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -132,10 +132,14 @@ static int hardlockup_detector_event_create(void) } /** - * hardlockup_detector_perf_enable - Enable the local event + * watchdog_hardlockup_enable - Enable the local event + * + * @cpu: The CPU to enable hard lockup on. */ -void hardlockup_detector_perf_enable(void) +void watchdog_hardlockup_enable(unsigned int cpu) { + WARN_ON_ONCE(cpu != smp_processor_id()); + if (hardlockup_detector_event_create()) return; @@ -147,12 +151,16 @@ void hardlockup_detector_perf_enable(void) } /** - * hardlockup_detector_perf_disable - Disable the local event + * watchdog_hardlockup_disable - Disable the local event + * + * @cpu: The CPU to enable hard lockup on. 
*/ -void hardlockup_detector_perf_disable(void) +void watchdog_hardlockup_disable(unsigned int cpu) { struct perf_event *event = this_cpu_read(watchdog_ev); + WARN_ON_ONCE(cpu != smp_processor_id()); + if (event) { perf_event_disable(event); this_cpu_write(watchdog_ev, NULL); @@ -227,9 +235,9 @@ void __init hardlockup_detector_perf_restart(void) } /** - * hardlockup_detector_perf_init - Probe whether NMI event is available at all + * watchdog_hardlockup_probe - Probe whether NMI event is available at all */ -int __init hardlockup_detector_perf_init(void) +int __init watchdog_hardlockup_probe(void) { int ret = hardlockup_detector_event_create(); -- cgit v1.2.3 From 1f423c905a6b43b493df1b259e6e6267e5624e62 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:38 -0700 Subject: watchdog/hardlockup: detect hard lockups using secondary (buddy) CPUs Implement a hardlockup detector that doesn't need any extra arch-specific support code to detect lockups. Instead of using something arch-specific we will use the buddy system, where each CPU watches out for another one. Specifically, each CPU will use its softlockup hrtimer to check that the next CPU is processing hrtimer interrupts by verifying that a counter is increasing (a minimal sketch of this ring arrangement follows the pros/cons list below). NOTE: unlike the other hard lockup detectors, the buddy one can't easily show what's happening on the CPU that locked up just by doing a simple backtrace. It relies on some other mechanism in the system to get information about the locked up CPUs. This could be support for NMI backtraces like [1], it could be a mechanism for printing the PC of locked CPUs at panic time like [2] / [3], or it could be something else. Even though that means we still rely on arch-specific code, this arch-specific code seems to often be implemented even on architectures that don't have a hardlockup detector. This style of hardlockup detector originated in some downstream Android trees and has been rebased on / carried in ChromeOS trees for quite a long time for use on arm and arm64 boards. Historically on these boards we've leveraged mechanism [2] / [3] to get information about hung CPUs, but we could move to [1]. Although the original motivation for the buddy system was for use on systems without an arch-specific hardlockup detector, it can still be useful even on systems that _do_ have an arch-specific hardlockup detector. On x86, for instance, there is a 24-part patch series [4] in progress switching the arch-specific hard lockup detector from a scarce perf counter to a less-scarce hardware resource. Potentially the buddy system could be a simpler alternative to free up the perf counter but still get hard lockup detection. Overall, pros (+) and cons (-) of the buddy system compared to an arch-specific hardlockup detector (which might be implemented using perf): + The buddy system is usable on systems that don't have an arch-specific hardlockup detector, like arm32 and arm64 (though it's being worked on for arm64 [5]). + The buddy system may free up scarce hardware resources. + If a CPU totally goes out to lunch (can't process NMIs) the buddy system could still detect the problem (though it would be unlikely to be able to get a stack trace). + The buddy system uses the same timer function to pet the hardlockup detector on the running CPU as it uses to detect hardlockups on other CPUs. Compared to other hardlockup detectors, this means it generates fewer interrupts and thus is likely better able to let CPUs stay idle longer. - If all CPUs are hard locked up at the same time the buddy system can't detect it. - If we don't have SMP we can't use the buddy system. - The buddy system needs an arch-specific mechanism (possibly NMI backtrace) to get info about the locked up CPU.
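The following is a minimal, self-contained sketch of the buddy ring described above, for illustration only. It is not kernel code: NR_CPUS, LOCKED_CPU and buddy_stuck() are inventions of this example; the real detector (kernel/watchdog_buddy.c in the diff below) walks the online-CPU mask, uses per-CPU atomics and touch flags, and only checks every third sample.

#include <stdio.h>

#define NR_CPUS 4
#define LOCKED_CPU 2 /* the CPU we pretend is stuck */

/* Per-CPU tick counters; the real detector uses per-CPU atomic_t. */
static unsigned long hrtimer_count[NR_CPUS];
static unsigned long saved_count[NR_CPUS];

/* Each CPU watches the next one, wrapping around to form a ring. */
static int next_cpu(int cpu)
{
	return (cpu + 1) % NR_CPUS;
}

/* Mirrors the is_hardlockup() idea: no progress since the last look. */
static int buddy_stuck(int buddy)
{
	if (saved_count[buddy] == hrtimer_count[buddy])
		return 1;
	saved_count[buddy] = hrtimer_count[buddy];
	return 0;
}

int main(void)
{
	for (int round = 0; round < 3; round++) {
		/* Kick phase: each healthy CPU's timer bumps its own counter. */
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (cpu != LOCKED_CPU)
				hrtimer_count[cpu]++;

		/* Check phase: each healthy CPU looks at its buddy's counter. */
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (cpu != LOCKED_CPU && buddy_stuck(next_cpu(cpu)))
				printf("cpu %d: buddy cpu %d looks hard locked up\n",
				       cpu, next_cpu(cpu));
	}
	return 0;
}

In each simulated round, CPU 1 notices that CPU 2's counter has stopped advancing; that is exactly the "counter is increasing" test the commit message describes, with the same caveat that the checker cannot backtrace the stuck CPU by itself.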
[1] https://lore.kernel.org/r/20230419225604.21204-1-dianders@chromium.org [2] https://issuetracker.google.com/172213129 [3] https://docs.kernel.org/trace/coresight/coresight-cpu-debug.html [4] https://lore.kernel.org/lkml/20230301234753.28582-1-ricardo.neri-calderon@linux.intel.com/ [5] https://lore.kernel.org/linux-arm-kernel/20220903093415.15850-1-lecopzer.chen@mediatek.com/ Link: https://lkml.kernel.org/r/20230519101840.v5.14.I6bf789d21d0c3d75d382e7e51a804a7a51315f2c@changeid Signed-off-by: Colin Cross Signed-off-by: Matthias Kaehlcke Signed-off-by: Guenter Roeck Signed-off-by: Tzung-Bi Shih Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Daniel Thompson Cc: "David S. Miller" Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/Makefile | 1 + kernel/watchdog.c | 29 +++++++++++---- kernel/watchdog_buddy.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 kernel/watchdog_buddy.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 7eb72033143c..f9e3fd9195d9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e67125f64719..10947c835079 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -85,7 +85,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER) static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); @@ -106,6 +106,14 @@ notrace void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +void watchdog_hardlockup_touch_cpu(unsigned int cpu) +{ + per_cpu(watchdog_hardlockup_touched, cpu) = true; + + /* Match with smp_rmb() in watchdog_hardlockup_check() */ + smp_wmb(); +} + static bool is_hardlockup(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); @@ -123,13 +131,16 @@ static bool is_hardlockup(unsigned int cpu) return false; } -static void watchdog_hardlockup_kick(void) +static unsigned long watchdog_hardlockup_kick(void) { - atomic_inc(raw_cpu_ptr(&hrtimer_interrupts)); + return atomic_inc_return(raw_cpu_ptr(&hrtimer_interrupts)); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + /* Match with smp_wmb() in watchdog_hardlockup_touch_cpu() */ + smp_rmb(); + if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; @@ -180,11 +191,11 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs
*regs) } } -#else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */ -static inline void watchdog_hardlockup_kick(void) { } +#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ +static inline unsigned long watchdog_hardlockup_kick(void) { return 0; } -#endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */ +#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ /* * These functions can be overridden based on the configured hardlockup detector. @@ -443,11 +454,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + unsigned long hrtimer_interrupts; if (!watchdog_enabled) return HRTIMER_NORESTART; - watchdog_hardlockup_kick(); + hrtimer_interrupts = watchdog_hardlockup_kick(); + + /* test for hardlockups */ + watchdog_buddy_check_hardlockup(hrtimer_interrupts); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c new file mode 100644 index 000000000000..fee45af2e5bd --- /dev/null +++ b/kernel/watchdog_buddy.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/nmi.h> +#include <linux/percpu-defs.h> + +static cpumask_t __read_mostly watchdog_cpus; + +static unsigned int watchdog_next_cpu(unsigned int cpu) +{ + cpumask_t cpus = watchdog_cpus; + unsigned int next_cpu; + + next_cpu = cpumask_next(cpu, &cpus); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(&cpus); + + if (next_cpu == cpu) + return nr_cpu_ids; + + return next_cpu; +} + +int __init watchdog_hardlockup_probe(void) +{ + return 0; +} + +void watchdog_hardlockup_enable(unsigned int cpu) +{ + unsigned int next_cpu; + + /* + * The new CPU will be marked online before the hrtimer interrupt + * gets a chance to run on it. If another CPU tests for a + * hardlockup on the new CPU before it has run the hrtimer + * interrupt, it will get a false positive. Touch the watchdog on + * the new CPU to delay the check for at least 3 sampling periods + * to guarantee one hrtimer has run on the new CPU. + */ + watchdog_hardlockup_touch_cpu(cpu); + + /* + * We are going to check the next CPU. Our watchdog_hrtimer + * need not be zero if the CPU has already been online earlier. + * Touch the watchdog on the next CPU to avoid false positive + * if we try to check it in less than 3 interrupts. + */ + next_cpu = watchdog_next_cpu(cpu); + if (next_cpu < nr_cpu_ids) + watchdog_hardlockup_touch_cpu(next_cpu); + + cpumask_set_cpu(cpu, &watchdog_cpus); +} + +void watchdog_hardlockup_disable(unsigned int cpu) +{ + unsigned int next_cpu = watchdog_next_cpu(cpu); + + /* + * Offlining this CPU will cause the CPU before this one to start + * checking the one after this one. If this CPU just finished checking + * the next CPU and updating hrtimer_interrupts_saved, and then the + * previous CPU checks it within one sample period, it will trigger a + * false positive. Touch the watchdog on the next CPU to prevent it. + */ + if (next_cpu < nr_cpu_ids) + watchdog_hardlockup_touch_cpu(next_cpu); + + cpumask_clear_cpu(cpu, &watchdog_cpus); +} + +void watchdog_buddy_check_hardlockup(unsigned long hrtimer_interrupts) +{ + unsigned int next_cpu; + + /* + * Test for hardlockups every 3 samples. The sample period is + * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over + * watchdog_thresh (over by 20%).
+ */ + if (hrtimer_interrupts % 3 != 0) + return; + + /* check for a hardlockup on the next CPU */ + next_cpu = watchdog_next_cpu(smp_processor_id()); + if (next_cpu >= nr_cpu_ids) + return; + + watchdog_hardlockup_check(next_cpu, NULL); +} -- cgit v1.2.3 From b17aa959330e8058452297049a0056ba4b9c72e8 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 May 2023 10:18:39 -0700 Subject: watchdog/perf: add a weak function for an arch to detect if perf can use NMIs On arm64, NMI support needs to be detected at runtime. Add a weak function to the perf hardlockup detector so that an architecture can implement it to detect whether NMIs are available. Link: https://lkml.kernel.org/r/20230519101840.v5.15.Ic55cb6f90ef5967d8aaa2b503a4e67c753f64d3a@changeid Signed-off-by: Douglas Anderson Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Lecopzer Chen Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Petr Mladek Cc: Pingfan Liu Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog_perf.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 349fcd4d2abc..8ea00c4a24b2 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -234,12 +234,22 @@ void __init hardlockup_detector_perf_restart(void) } } +bool __weak __init arch_perf_nmi_is_available(void) +{ + return true; +} + /** * watchdog_hardlockup_probe - Probe whether NMI event is available at all */ int __init watchdog_hardlockup_probe(void) { - int ret = hardlockup_detector_event_create(); + int ret; + + if (!arch_perf_nmi_is_available()) + return -ENODEV; + + ret = hardlockup_detector_event_create(); if (ret) { pr_info("Perf NMI watchdog permanently disabled\n"); -- cgit v1.2.3 From 930d8f8dbab97cb05dba30e67a2dfa0c6dbf4bc7 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Fri, 19 May 2023 10:18:40 -0700 Subject: watchdog/perf: adapt the watchdog_perf interface for async model When lockup_detector_init() calls watchdog_hardlockup_probe(), the PMU may not be ready yet. E.g. on arm64, the PMU is not ready until device_initcall(armv8_pmu_driver_init). And it is deeply integrated with the driver model and cpuhp. Hence it is hard to push this initialization before smp_init(). But it is easy to take an opposite approach and try to initialize the watchdog once again later. The delayed probe is called using workqueues. It needs to allocate memory and must proceed in a normal context. The delayed probe is used if watchdog_hardlockup_probe() returns non-zero, which is the code returned when the PMU is not ready yet. Provide an API - lockup_detector_retry_init() - for anyone who needs to delay the lockup detector init after having failed in lockup_detector_init(). The original assumption is: nobody should use the delayed probe after lockup_detector_check(), which has the __init attribute.
That is, anyone using this API must call it between lockup_detector_init() and lockup_detector_check(), and the caller must have the __init attribute Link: https://lkml.kernel.org/r/20230519101840.v5.16.If4ad5dd5d09fb1309cebf8bcead4b6a5a7758ca7@changeid Reviewed-by: Petr Mladek Co-developed-by: Pingfan Liu Signed-off-by: Pingfan Liu Signed-off-by: Lecopzer Chen Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Cc: Andi Kleen Cc: Catalin Marinas Cc: Chen-Yu Tsai Cc: Christophe Leroy Cc: Colin Cross Cc: Daniel Thompson Cc: "David S. Miller" Cc: Guenter Roeck Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Masayoshi Mizuma Cc: Matthias Kaehlcke Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Randy Dunlap Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Stephane Eranian Cc: Stephen Boyd Cc: Sumit Garg Cc: Tzung-Bi Shih Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/watchdog.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 10947c835079..237990e8d345 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -208,7 +208,13 @@ void __weak watchdog_hardlockup_enable(unsigned int cpu) { } void __weak watchdog_hardlockup_disable(unsigned int cpu) { } -/* Return 0, if a hardlockup watchdog is available. Error code otherwise */ +/* + * Watchdog-detector specific API. + * + * Return 0 when hardlockup watchdog is available, negative value otherwise. + * Note that the negative value means that a delayed probe might + * succeed later. + */ int __weak __init watchdog_hardlockup_probe(void) { @@ -954,6 +960,62 @@ static void __init watchdog_sysctl_init(void) #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static void __init lockup_detector_delay_init(struct work_struct *work); +static bool allow_lockup_detector_init_retry __initdata; + +static struct work_struct detector_work __initdata = + __WORK_INITIALIZER(detector_work, lockup_detector_delay_init); + +static void __init lockup_detector_delay_init(struct work_struct *work) +{ + int ret; + + ret = watchdog_hardlockup_probe(); + if (ret) { + pr_info("Delayed init of the lockup detector failed: %d\n", ret); + pr_info("Hard watchdog permanently disabled\n"); + return; + } + + allow_lockup_detector_init_retry = false; + + watchdog_hardlockup_available = true; + lockup_detector_setup(); +} + +/* + * lockup_detector_retry_init - retry init lockup detector if possible. + * + * Retry hardlockup detector init. It is useful when it requires some + * functionality that has to be initialized later on a particular + * platform. + */ +void __init lockup_detector_retry_init(void) +{ + /* Must be called before late init calls */ + if (!allow_lockup_detector_init_retry) + return; + + schedule_work(&detector_work); +} + +/* + * Ensure that the optional delayed hardlockup init is performed before + * the init code and memory is freed. + */ +static int __init lockup_detector_check(void) +{ + /* Prevent any later retry. */ + allow_lockup_detector_init_retry = false; + + /* Make sure no work is pending.
*/ + flush_work(&detector_work); + + return 0; + +} +late_initcall_sync(lockup_detector_check); + void __init lockup_detector_init(void) { if (tick_nohz_full_enabled()) @@ -964,6 +1026,9 @@ void __init lockup_detector_init(void) if (!watchdog_hardlockup_probe()) watchdog_hardlockup_available = true; + else + allow_lockup_detector_init_retry = true; + lockup_detector_setup(); watchdog_sysctl_init(); } -- cgit v1.2.3 From 4df3504e2f17cb35d2c12b07e716ae37971d0d52 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 25 May 2023 16:26:25 +0200 Subject: kexec: avoid calculating array size twice Avoid calculating array size twice in kexec_purgatory_setup_sechdrs(). Once using array_size(), and once open-coded. Flagged by Coccinelle: .../kexec_file.c:881:8-25: WARNING: array_size is already used (line 877) to compute the same size No functional change intended. Compile tested only. Link: https://lkml.kernel.org/r/20230525-kexec-array_size-v1-1-8b4bf4f7500a@kernel.org Signed-off-by: Simon Horman Acked-by: Baoquan He Cc: Eric W. Biederman Cc: Zhen Lei Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f989f5f1933b..3f5677679744 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -867,6 +867,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, { unsigned long bss_addr; unsigned long offset; + size_t sechdrs_size; Elf_Shdr *sechdrs; int i; @@ -874,11 +875,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * The section headers in kexec_purgatory are read-only. In order to * have them modifiable make a temporary copy. */ - sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum)); + sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum); + sechdrs = vzalloc(sechdrs_size); if (!sechdrs) return -ENOMEM; - memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, - pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size); pi->sechdrs = sechdrs; offset = 0; -- cgit v1.2.3 From 1cba6c4309f03de570202c46f03df3f73a0d4c82 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:34 +0800 Subject: kexec: fix a memory leak in crash_shrink_memory() Patch series "kexec: enable kexec_crash_size to support two crash kernel regions". When crashkernel=X fails to reserve region under 4G, it will fall back to reserve region above 4G and a region of the default size will also be reserved under 4G. Unfortunately, /sys/kernel/kexec_crash_size only supports one crash kernel region now, the user cannot sense the low memory reserved by reading /sys/kernel/kexec_crash_size. Also, low memory cannot be freed by writing this file. For example: resource_size(crashk_res) = 512M resource_size(crashk_low_res) = 256M The result of 'cat /sys/kernel/kexec_crash_size' is 512M, but it should be 768M. When we execute 'echo 0 > /sys/kernel/kexec_crash_size', the size of crashk_res becomes 0 and resource_size(crashk_low_res) is still 256 MB, which is incorrect. Since crashk_res manages the memory with high address and crashk_low_res manages the memory with low address, crashk_low_res is shrunken only when all crashk_res is shrunken. And because when there is only one crash kernel region, crashk_res is always used. Therefore, if all crashk_res is shrunken and crashk_low_res still exists, swap them. 
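To make the shrink-and-swap rule above concrete, here is a small standalone sketch of the intended semantics, for illustration only. struct region, region_size(), shrink() and shrink_total() are inventions of this example, standing in for the kernel's struct resource and the helpers added later in this series; the 512M/256M figures mirror the example above.

#include <stdio.h>

/* Simplified stand-ins for crashk_res / crashk_low_res. */
struct region {
	unsigned long start;
	unsigned long end; /* inclusive; 0 when the region is unused */
};

static unsigned long region_size(const struct region *r)
{
	return !r->end ? 0 : r->end - r->start + 1;
}

/* Keep only @new_size bytes of @r (from its start); 0 empties it. */
static void shrink(struct region *r, unsigned long new_size)
{
	if (!new_size) {
		r->start = 0;
		r->end = 0;
	} else {
		r->end = r->start + new_size - 1;
	}
}

/*
 * The policy from the text: shrink the high region first; only when it
 * is completely gone is the low region shrunk. If that leaves only the
 * low region, swap so that "res" always names the remaining memory.
 */
static void shrink_total(struct region *res, struct region *low_res,
			 unsigned long new_size)
{
	unsigned long low_size = region_size(low_res);

	if (low_size > new_size) {
		shrink(res, 0);
		shrink(low_res, new_size);
	} else {
		shrink(res, new_size - low_size);
	}

	if (!res->end && low_res->end) { /* swap when the high region emptied */
		*res = *low_res;
		low_res->start = 0;
		low_res->end = 0;
	}
}

int main(void)
{
	/* 512M high region + 256M low region, as in the example above. */
	struct region res = { 0x40000000UL, 0x40000000UL + (512UL << 20) - 1 };
	struct region low = { 0x10000000UL, 0x10000000UL + (256UL << 20) - 1 };

	shrink_total(&res, &low, 128UL << 20); /* keep 128M in total */
	printf("res: %luM, low: %luM\n",
	       region_size(&res) >> 20, region_size(&low) >> 20);
	return 0;
}

After the call, only 128M remains and it is reported through "res", matching the rule that the combined size is what the sysfs interface reads and frees.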
This patch (of 6): If the value of parameter 'new_size' is in the semi-open and semi-closed interval (crashk_res.end - KEXEC_CRASH_MEM_ALIGN + 1, crashk_res.end], the calculation result of ram_res is: ram_res->start = crashk_res.end + 1 ram_res->end = crashk_res.end The operation of insert_resource() fails, and ram_res is not added to iomem_resource. As a result, the memory of the control block ram_res is leaked. In fact, on all architectures, the start address and size of crashk_res are already aligned by KEXEC_CRASH_MEM_ALIGN. Therefore, we do not need to round up crashk_res.start again. Instead, we should round up 'new_size' in advance. Link: https://lkml.kernel.org/r/20230527123439.772-1-thunder.leizhen@huawei.com Link: https://lkml.kernel.org/r/20230527123439.772-2-thunder.leizhen@huawei.com Fixes: 6480e5a09237 ("kdump: add missing RAM resource in crash_shrink_memory()") Fixes: 06a7f711246b ("kexec: premit reduction of the reserved memory size") Signed-off-by: Zhen Lei Acked-by: Baoquan He Cc: Cong Wang Cc: Eric W. Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3d578c6fefee..22acee18195a 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1122,6 +1122,7 @@ int crash_shrink_memory(unsigned long new_size) start = crashk_res.start; end = crashk_res.end; old_size = (end == 0) ? 0 : end - start + 1; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; @@ -1133,9 +1134,7 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - + end = start + new_size; crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) -- cgit v1.2.3 From 6f22a744f4ee7a22be4704cf93bbe22decc7e79e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:35 +0800 Subject: kexec: delete a useless check in crash_shrink_memory() The check '(crashk_res.parent != NULL)' is added by commit e05bd3367bd3 ("kexec: fix Oops in crash_shrink_memory()"), but it's stale now. Because if 'crashk_res' is not reserved, it will be zero in size and will be intercepted by the above 'if (new_size >= old_size)'. Ago: if (new_size >= end - start + 1) Now: old_size = (end == 0) ? 0 : end - start + 1; if (new_size >= old_size) Link: https://lkml.kernel.org/r/20230527123439.772-3-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Cc: Baoquan He Cc: Cong Wang Cc: Eric W. 
Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 22acee18195a..d1ab139dd490 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1137,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) end = start + new_size; crash_free_reserved_phys_range(end, crashk_res.end); - if ((start == end) && (crashk_res.parent != NULL)) + if (start == end) release_resource(&crashk_res); ram_res->start = end; -- cgit v1.2.3 From f7f567b95b12eda7a6a273b8cb82d152491bc0da Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:36 +0800 Subject: kexec: clear crashk_res if all its memory has been released If the resource of crashk_res has been released, it is better to clear crashk_res.start and crashk_res.end. Because 'end = start - 1' is not reasonable, and in some places the test is based on crashk_res.end, not resource_size(&crashk_res). Link: https://lkml.kernel.org/r/20230527123439.772-4-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Acked-by: Baoquan He Cc: Cong Wang Cc: Eric W. Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d1ab139dd490..bcc86a250ab3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1137,15 +1137,18 @@ int crash_shrink_memory(unsigned long new_size) end = start + new_size; crash_free_reserved_phys_range(end, crashk_res.end); - if (start == end) - release_resource(&crashk_res); - ram_res->start = end; ram_res->end = crashk_res.end; ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; - crashk_res.end = end - 1; + if (start == end) { + release_resource(&crashk_res); + crashk_res.start = 0; + crashk_res.end = 0; + } else { + crashk_res.end = end - 1; + } insert_resource(&iomem_resource, ram_res); -- cgit v1.2.3 From 8a7db7790a3ff8413bedef886cf135ba301e88d7 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:37 +0800 Subject: kexec: improve the readability of crash_shrink_memory() The major adjustments are: 1. end = start + new_size. The 'end' here is not an accurate representation, because it is not the new end of crashk_res, but the start of ram_res, difference 1. So eliminate it and replace it with ram_res->start. 2. Use 'ram_res->start' and 'ram_res->end' as arguments to crash_free_reserved_phys_range() to indicate that the memory covered by 'ram_res' is released from the crashk. And keep it close to insert_resource(). 3. Replace 'if (start == end)' with 'if (!new_size)', clear indication that all crashk memory will be shrunken. No functional change. Link: https://lkml.kernel.org/r/20230527123439.772-5-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Acked-by: Baoquan He Cc: Cong Wang Cc: Eric W. 
Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bcc86a250ab3..69fe92141b0b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1108,7 +1108,6 @@ ssize_t crash_get_memory_size(void) int crash_shrink_memory(unsigned long new_size) { int ret = 0; - unsigned long start, end; unsigned long old_size; struct resource *ram_res; @@ -1119,9 +1118,7 @@ int crash_shrink_memory(unsigned long new_size) ret = -ENOENT; goto unlock; } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; + old_size = !crashk_res.end ? 0 : resource_size(&crashk_res); new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; @@ -1134,22 +1131,20 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } - end = start + new_size; - crash_free_reserved_phys_range(end, crashk_res.end); - - ram_res->start = end; + ram_res->start = crashk_res.start + new_size; ram_res->end = crashk_res.end; ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; - if (start == end) { + if (!new_size) { release_resource(&crashk_res); crashk_res.start = 0; crashk_res.end = 0; } else { - crashk_res.end = end - 1; + crashk_res.end = ram_res->start - 1; } + crash_free_reserved_phys_range(ram_res->start, ram_res->end); insert_resource(&iomem_resource, ram_res); unlock: -- cgit v1.2.3 From 5b7bfb32cbaad6ba997ab4295ee50bc9921252f2 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:38 +0800 Subject: kexec: add helper __crash_shrink_memory() No functional change, in preparation for the next patch so that it is easier to review. [akpm@linux-foundation.org: make __crash_shrink_memory() static] Link: https://lore.kernel.org/oe-kbuild-all/202305280717.Pw06aLkz-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230527123439.772-6-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Acked-by: Baoquan He Cc: Cong Wang Cc: Eric W. 
Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 69fe92141b0b..f7ff7de94a61 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1105,11 +1105,38 @@ ssize_t crash_get_memory_size(void) return size; } +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long old_size; - struct resource *ram_res; if (!kexec_trylock()) return -EBUSY; @@ -1125,27 +1152,7 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - ram_res->start = crashk_res.start + new_size; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(&crashk_res); - crashk_res.start = 0; - crashk_res.end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); + ret = __crash_shrink_memory(&crashk_res, new_size); unlock: kexec_unlock(); -- cgit v1.2.3 From 16c6006af4d4e70ecef93977a5314409d931020b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 27 May 2023 20:34:39 +0800 Subject: kexec: enable kexec_crash_size to support two crash kernel regions The crashk_low_res should be considered by /sys/kernel/kexec_crash_size to support shrinking when two crash kernel regions exist. While doing it, crashk_low_res will only be shrunk when the entire crashk_res is empty; and if crashk_res is empty and crashk_low_res is not, crashk_low_res is changed to become crashk_res. [bhe@redhat.com: redo changelog] Link: https://lkml.kernel.org/r/20230527123439.772-7-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Acked-by: Baoquan He Cc: Cong Wang Cc: Eric W. Biederman Cc: Michael Holzheu Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f7ff7de94a61..e2f2574d8b74 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1091,6 +1091,11 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) } } +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ?
0 : resource_size(res); +} + ssize_t crash_get_memory_size(void) { ssize_t size = 0; @@ -1098,8 +1103,8 @@ ssize_t crash_get_memory_size(void) if (!kexec_trylock()) return -EBUSY; - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); kexec_unlock(); return size; @@ -1136,7 +1141,7 @@ static int __crash_shrink_memory(struct resource *old_res, int crash_shrink_memory(unsigned long new_size) { int ret = 0; - unsigned long old_size; + unsigned long old_size, low_size; if (!kexec_trylock()) return -EBUSY; @@ -1145,14 +1150,42 @@ int crash_shrink_memory(unsigned long new_size) ret = -ENOENT; goto unlock; } - old_size = !crashk_res.end ? 0 : resource_size(&crashk_res); + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } - ret = __crash_shrink_memory(&crashk_res, new_size); + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } + + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } unlock: kexec_unlock(); -- cgit v1.2.3 From a94181ec064b3ad1b6f573f5953e2011f8c90292 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Jun 2023 16:28:45 +0200 Subject: syscalls: add sys_ni_posix_timers prototype The sys_ni_posix_timers() definition causes a warning when the declaration is missing, so this needs to be added along with the normal syscalls, outside of the #ifdef. kernel/time/posix-stubs.c:26:17: error: no previous prototype for 'sys_ni_posix_timers' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20230607142925.3126422-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Signed-off-by: Andrew Morton --- kernel/time/posix-stubs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 828aeecbd1e8..39769b2d1005 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* Architectures may override SYS_NI and COMPAT_SYS_NI */ -- cgit v1.2.3 From c889d0793d9dc07e94a5fddcc05356157fab00b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:39 +0200 Subject: swsusp: don't pass a stack address to blkdev_get_by_path holder is just an on-stack pointer that can easily be reused by other calls, replace it with a static variable that doesn't change. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20230608110258.189493-12-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 81aec3b2c605..b03ff1a33c7f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1510,6 +1510,8 @@ end: return error; } +static void *swsusp_holder; + /** * swsusp_check - Check for swsusp signature in the resume device */ @@ -1517,14 +1519,13 @@ end: int swsusp_check(bool snapshot_test) { int error; - void *holder; fmode_t mode = FMODE_READ; if (snapshot_test) mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &holder, NULL); + mode, &swsusp_holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); -- cgit v1.2.3 From 2736e8eeb0ccdc71d1f4256c9c9a28f58cc43307 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:43 +0200 Subject: block: use the holder as indication for exclusive opens The current interface for exclusive opens is rather confusing as it requires both the FMODE_EXCL flag and a holder. Remove the need to pass FMODE_EXCL and just key off the exclusive open off a non-NULL holder. For blkdev_put this requires adding the holder argument, which provides better debug checking that only the holder actually releases the hold, but at the same time allows removing the now superfluous mode argument. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: David Sterba [btrfs] Acked-by: Jack Wang [rnbd] Link: https://lore.kernel.org/r/20230608110258.189493-16-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 12 ++++-------- kernel/power/power.h | 2 +- kernel/power/swap.c | 21 +++++++++------------ 3 files changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 7ae95ec72f99..f62e89d0d906 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -688,22 +688,18 @@ static int load_image_and_restore(bool snapshot_test) { int error; unsigned int flags; - fmode_t mode = FMODE_READ; - - if (snapshot_test) - mode |= FMODE_EXCL; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(mode); + swsusp_close(snapshot_test); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(mode); + swsusp_close(snapshot_test); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -956,7 +952,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Unlock; } @@ -991,7 +987,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Finish; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 978189fcafd1..a8e0c44b804e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,7 +177,7 @@ int swsusp_check(bool snapshot_test); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(fmode_t); +void swsusp_close(bool snapshot_test); #ifdef CONFIG_SUSPEND extern int 
swsusp_unmark(void); #endif diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b03ff1a33c7f..cc9259307c94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -363,7 +363,7 @@ static int swsusp_swap_check(void) res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, NULL); return res; } @@ -443,7 +443,7 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(FMODE_WRITE); + swsusp_close(false); return ret; } @@ -508,7 +508,7 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(FMODE_WRITE); + swsusp_close(false); return error; } @@ -1518,14 +1518,11 @@ static void *swsusp_holder; int swsusp_check(bool snapshot_test) { + void *holder = snapshot_test ? &swsusp_holder : NULL; int error; - fmode_t mode = FMODE_READ; - if (snapshot_test) - mode |= FMODE_EXCL; - - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &swsusp_holder, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_READ, + holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1552,7 +1549,7 @@ int swsusp_check(bool snapshot_test) put: if (error) - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, holder); else pr_debug("Image signature found, resuming\n"); } else { @@ -1569,14 +1566,14 @@ put: * swsusp_close - close swap device. */ -void swsusp_close(fmode_t mode) +void swsusp_close(bool snapshot_test) { if (IS_ERR(hib_resume_bdev)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL); } /** -- cgit v1.2.3 From 05bdb9965305bbfdae79b31d22df03d1e2cfcb22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:55 +0200 Subject: block: replace fmode_t with a block-specific type for block open flags The only overlap between the block open flags mapped into the fmode_t and other uses of fmode_t are FMODE_READ and FMODE_WRITE. Define a new blk_mode_t instead for use in blkdev_get_by_{dev,path}, ->open and ->ioctl and stop abusing fmode_t. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang [rnbd] Reviewed-by: Hannes Reinecke Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-28-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/swap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index cc9259307c94..f6ebcd00c410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -356,8 +356,8 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(hib_resume_bdev)) return PTR_ERR(hib_resume_bdev); @@ -1521,7 +1521,7 @@ int swsusp_check(bool snapshot_test) void *holder = snapshot_test ? 
&swsusp_holder : NULL; int error; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_READ, + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); -- cgit v1.2.3 From 5273ee254e34cf577460dc8eb786b26a7b5329d8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Jun 2023 14:41:38 +0800 Subject: cgroup/cpuset: remove unneeded header files Remove some unnecessary header files. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b0aee733b92b..58e6f18f01c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -37,12 +37,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include -- cgit v1.2.3 From ba49f976885869835a1783863376221dc24f1817 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Jun 2023 15:50:18 +0200 Subject: bpf: Hide unused bpf_patch_call_args This function is only used when CONFIG_BPF_JIT_ALWAYS_ON is disabled, but CONFIG_BPF_SYSCALL is enabled. When both are turned off, the prototype is missing but the unused function is still compiled, as seen from this W=1 warning: [...] kernel/bpf/core.c:2075:6: error: no previous prototype for 'bpf_patch_call_args' [-Werror=missing-prototypes] [...] Add a matching #ifdef for the definition to leave it out. Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230602135128.1498362-1-arnd@kernel.org --- kernel/bpf/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7421487422d4..dc85240a0134 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2064,14 +2064,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), -static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, - const struct bpf_insn *insn) = { +static __maybe_unused +u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST +#ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); @@ -2080,7 +2082,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } - +#endif #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) -- cgit v1.2.3 From d16b3af46679a1eb21652c37711a60d3d4e6b8c0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Jun 2023 11:57:37 +0800 Subject: cgroup: remove unused task_cgroup_path() task_cgroup_path() is not used anymore. So remove it. 
Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57f31b234433..04d1c0cde882 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2364,45 +2364,6 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, } EXPORT_SYMBOL_GPL(cgroup_path_ns); -/** - * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy - * @task: target task - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Determine @task's cgroup on the first (the one with the lowest non-zero - * hierarchy_id) cgroup hierarchy and copy its path into @buf. This - * function grabs cgroup_mutex and shouldn't be used inside locks used by - * cgroup controller callbacks. - * - * Return value is the same as kernfs_path(). - */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) -{ - struct cgroup_root *root; - struct cgroup *cgrp; - int hierarchy_id = 1; - int ret; - - cgroup_lock(); - spin_lock_irq(&css_set_lock); - - root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); - - if (root) { - cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); - } else { - /* if no hierarchy exists, everyone is in "/" */ - ret = strscpy(buf, "/", buflen); - } - - spin_unlock_irq(&css_set_lock); - cgroup_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(task_cgroup_path); - /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem -- cgit v1.2.3 From cdb8c100d8a4b4e31c829724e40b4fdf32977cce Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 2 Jun 2023 02:30:22 -0500 Subject: include/linux/suspend.h: Only show pm_pr_dbg messages at suspend/resume All uses in the kernel are currently already oriented around suspend/resume. As some other parts of the kernel may also use these messages in functions that could also be used outside of suspend/resume, only enable in suspend/resume path. Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 3113ec2f1db4..daa535012e51 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -556,6 +556,12 @@ power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; +bool pm_debug_messages_should_print(void) +{ + return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; +} +EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); + static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -- cgit v1.2.3 From 5ba3a7a851e3ebffc4cb8f052a4581c4d8af3ae3 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 9 Jun 2023 22:50:49 -0500 Subject: bpf: Add bpf_cpumask_first_and() kfunc We currently provide bpf_cpumask_first(), bpf_cpumask_any(), and bpf_cpumask_any_and() kfuncs. bpf_cpumask_any() and bpf_cpumask_any_and() are confusing misnomers in that they actually just call cpumask_first() and cpumask_first_and() respectively. We'll replace them with bpf_cpumask_any_distribute() and bpf_cpumask_any_distribute_and() kfuncs in a subsequent patch, so let's ensure feature parity by adding a bpf_cpumask_first_and() kfunc to account for bpf_cpumask_any_and() being removed. 
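For illustration, a minimal sketch of how the new kfunc might be exercised from a BPF program, in the style of the cpumask selftests. This is the editor's sketch, not part of the patch: the section name, tracepoint, helper kfunc declarations, and the pointer casts below are illustrative assumptions.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
u32 bpf_cpumask_first_and(const struct cpumask *src1,
			  const struct cpumask *src2) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("tp_btf/task_newtask")
int BPF_PROG(first_and_example, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *a, *b;
	u32 cpu;

	/* Acquire two masks; both must be released before returning. */
	a = bpf_cpumask_create();
	if (!a)
		return 0;
	b = bpf_cpumask_create();
	if (!b) {
		bpf_cpumask_release(a);
		return 0;
	}

	bpf_cpumask_set_cpu(0, a);		/* a = {0}    */
	bpf_cpumask_set_cpu(0, b);		/* b = {0, 1} */
	bpf_cpumask_set_cpu(1, b);

	/* struct bpf_cpumask * may be passed where const struct cpumask *
	 * is expected, so the first bit set in both masks is returned;
	 * expect cpu == 0 here.
	 */
	cpu = bpf_cpumask_first_and((const struct cpumask *)a,
				    (const struct cpumask *)b);

	bpf_cpumask_release(b);
	bpf_cpumask_release(a);
	return 0;
}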
Signed-off-by: David Vernet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230610035053.117605-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 7efdf5d770ca..9416c8ac8a04 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -131,6 +131,21 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) return cpumask_first_zero(cpumask); } +/** + * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the + * AND of two cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Find the index of the first nonzero bit of the AND of two cpumasks. + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, + const struct cpumask *src2) +{ + return cpumask_first_and(src1, src2); +} + /** * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. * @cpu: The CPU to be set in the cpumask. @@ -406,6 +421,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) -- cgit v1.2.3 From f983be917332ea5e03f689e12c6668be48cb4cfe Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 9 Jun 2023 22:50:51 -0500 Subject: bpf: Replace bpf_cpumask_any* with bpf_cpumask_any_distribute* We currently export the bpf_cpumask_any() and bpf_cpumask_any_and() kfuncs. Intuitively, one would expect these to choose any CPU in the cpumask, but what they actually do is alias to cpumask_first() and cpumask_first_and(). This is useless given that we already export bpf_cpumask_first() and bpf_cpumask_first_and(), so this patch replaces them with kfuncs that call cpumask_any_distribute() and cpumask_any_and_distribute(), which actually choose any CPU from the cpumask (or the AND of two cpumasks for the latter). Signed-off-by: David Vernet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230610035053.117605-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9416c8ac8a04..938a60ff4295 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -382,7 +382,7 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask } /** - * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask. * @cpumask: The cpumask being queried. * * Return: @@ -391,26 +391,28 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask * * A struct bpf_cpumask pointer may be safely passed to @src. */ -__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask) +__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) { - return cpumask_any(cpumask); + return cpumask_any_distribute(cpumask); } /** - * bpf_cpumask_any_and() - Return a random set CPU from the AND of two - * cpumasks.
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of + * two cpumasks. * @src1: The first cpumask. * @src2: The second cpumask. * * Return: - * * A random set bit within [0, num_cpus) if at least one bit is set. + * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at + * least one bit is set. * * >= num_cpus if no bit is set. * * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. */ -__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) { - return cpumask_any_and(src1, src2); + return cpumask_any_and_distribute(src1, src2); } __diag_pop(); @@ -438,8 +440,8 @@ BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { -- cgit v1.2.3 From 904e6ddf4133c52fdb9654c2cd2ad90f320d48b9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 13 Jun 2023 18:38:21 +0300 Subject: bpf: Use scalar ids in mark_chain_precision() Change mark_chain_precision() to track precision in situations like below: r2 = unknown value ... --- state #0 --- ... r1 = r2 // r1 and r2 now share the same ID ... --- state #1 {r1.id = A, r2.id = A} --- ... if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 ... --- state #2 {r1.id = A, r2.id = A} --- r3 = r10 r3 += r1 // need to mark both r1 and r2 At the beginning of the processing of each state, ensure that if a register with a scalar ID is marked as precise, all registers sharing this ID are also marked as precise. This property would be used by a follow-up change in regsafe(). Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230613153824.3324830-2-eddyz87@gmail.com --- kernel/bpf/verifier.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e38584d497c..064aef5cd186 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3779,6 +3779,96 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } +static bool idset_contains(struct bpf_idset *s, u32 id) +{ + u32 i; + + for (i = 0; i < s->count; ++i) + if (s->ids[i] == id) + return true; + + return false; +} + +static int idset_push(struct bpf_idset *s, u32 id) +{ + if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) + return -EFAULT; + s->ids[s->count++] = id; + return 0; +} + +static void idset_reset(struct bpf_idset *s) +{ + s->count = 0; +} + +/* Collect a set of IDs for all registers currently marked as precise in env->bt. + * Mark all registers with these IDs as precise. 
+ */ +static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_idset *precise_ids = &env->idset_scratch; + struct backtrack_state *bt = &env->bt; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + DECLARE_BITMAP(mask, 64); + int i, fr; + + idset_reset(precise_ids); + + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (!reg->id || reg->type != SCALAR_VALUE) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) + break; + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + } + + for (fr = 0; fr <= st->curframe; ++fr) { + func = st->frame[fr]; + + for (i = BPF_REG_0; i < BPF_REG_10; ++i) { + reg = &func->regs[i]; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_reg(bt, fr, i); + } + for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_slot(bt, fr, i); + } + } + + return 0; +} + /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -3910,6 +4000,31 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } + /* If some register with scalar ID is marked as precise, + * make sure that all registers sharing this ID are also precise. + * This is needed to estimate effect of find_equal_scalars(). + * Do this at the last instruction of each state, + * bpf_reg_state::id fields are valid for these instructions. + * + * Allows to track precision in situation like below: + * + * r2 = unknown value + * ... + * --- state #0 --- + * ... + * r1 = r2 // r1 and r2 now share the same ID + * ... + * --- state #1 {r1.id = A, r2.id = A} --- + * ... + * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 + * ... + * --- state #2 {r1.id = A, r2.id = A} --- + * r3 = r10 + * r3 += r1 // need to mark both r1 and r2 + */ + if (mark_precise_scalar_ids(env, st)) + return -EFAULT; + if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if -- cgit v1.2.3 From 1ffc85d9298e0ca0137ba65c93a786143fe167b8 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 13 Jun 2023 18:38:23 +0300 Subject: bpf: Verify scalar ids mapping in regsafe() using check_ids() Make sure that the following unsafe example is rejected by verifier: 1: r9 = ... some pointer with range X ... 2: r6 = ... unbound scalar ID=a ... 3: r7 = ... unbound scalar ID=b ... 4: if (r6 > r7) goto +1 5: r6 = r7 6: if (r6 > X) goto ... --- checkpoint --- 7: r9 += r7 8: *(u64 *)r9 = Y This example is unsafe because not all execution paths verify r7 range. Because of the jump at (4) the verifier would arrive at (6) in two states: I. r6{.id=b}, r7{.id=b} via path 1-6; II. r6{.id=a}, r7{.id=b} via path 1-4, 6. 
Currently regsafe() does not call check_ids() for scalar registers, thus from POV of regsafe() states (I) and (II) are identical. If the path 1-6 is taken by verifier first, and checkpoint is created at (6) the path [1-4, 6] would be considered safe. Changes in this commit: - check_ids() is modified to disallow mapping multiple old_id to the same cur_id. - check_scalar_ids() is added, unlike check_ids() it treats ID zero as a unique scalar ID. - check_scalar_ids() needs to generate temporary unique IDs, field 'tmp_id_gen' is added to bpf_verifier_env::idmap_scratch to facilitate this. - regsafe() is updated to: - use check_scalar_ids() for precise scalar registers. - compare scalar registers using memcmp only for explore_alu_limits branch. This simplifies control flow for scalar case, and has no measurable performance impact. - check_alu_op() is updated to avoid generating bpf_reg_state::id for constant scalar values when processing BPF_MOV. ID is needed to propagate range information for identical values, but there is nothing to propagate for constants. Fixes: 75748837b7e5 ("bpf: Propagate scalar ranges through register assignments.") Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230613153824.3324830-4-eddyz87@gmail.com --- kernel/bpf/verifier.c | 91 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 064aef5cd186..fa43dc8e85b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12934,12 +12934,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id && + !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - if (src_reg->type == SCALAR_VALUE && !src_reg->id) + if (need_id) /* Assign src and dst registers the same ID * that will be used by find_equal_scalars() * to propagate min/max range. @@ -12958,7 +12960,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; - if (is_src_reg_u32 && !src_reg->id) + if (is_src_reg_u32 && need_id) src_reg->id = ++env->id_gen; copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 range otherwise @@ -15114,8 +15116,9 @@ static bool range_within(struct bpf_reg_state *old, * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. 
*/ -static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { + struct bpf_id_pair *map = idmap->map; unsigned int i; /* either both IDs should be set or both should be zero */ @@ -15126,20 +15129,34 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) return true; for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!idmap[i].old) { + if (!map[i].old) { /* Reached an empty slot; haven't seen this id before */ - idmap[i].old = old_id; - idmap[i].cur = cur_id; + map[i].old = old_id; + map[i].cur = cur_id; return true; } - if (idmap[i].old == old_id) - return idmap[i].cur == cur_id; + if (map[i].old == old_id) + return map[i].cur == cur_id; + if (map[i].cur == cur_id) + return false; } /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; } +/* Similar to check_ids(), but allocate a unique temporary ID + * for 'old_id' or 'cur_id' of zero. + * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. + */ +static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) +{ + old_id = old_id ? old_id : ++idmap->tmp_id_gen; + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; + + return check_ids(old_id, cur_id, idmap); +} + static void clean_func_state(struct bpf_verifier_env *env, struct bpf_func_state *st) { @@ -15238,7 +15255,7 @@ next: static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && @@ -15247,7 +15264,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap) { if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ @@ -15284,15 +15301,42 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, switch (base_type(rold->type)) { case SCALAR_VALUE: - if (regs_exact(rold, rcur, idmap)) - return true; - if (env->explore_alu_limits) - return false; + if (env->explore_alu_limits) { + /* explore_alu_limits disables tnum_in() and range_within() + * logic and requires everything to be strict + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_scalar_ids(rold->id, rcur->id, idmap); + } if (!rold->precise) return true; - /* new val must satisfy old val knowledge */ + /* Why check_ids() for scalar registers? + * + * Consider the following BPF code: + * 1: r6 = ... unbound scalar, ID=a ... + * 2: r7 = ... unbound scalar, ID=b ... + * 3: if (r6 > r7) goto +1 + * 4: r6 = r7 + * 5: if (r6 > X) goto ... + * 6: ... memory operation using r7 ... + * + * First verification path is [1-6]: + * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; + * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * r7 <= X, because r6 and r7 share same id. + * Next verification path is [1-4, 6]. + * + * Instruction (6) would be reached in two states: + * I. r6{.id=b}, r7{.id=b} via path 1-6; + * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. + * + * Use check_ids() to distinguish these states. + * --- + * Also verify that new value satisfies old value range knowledge. 
+ */ return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_scalar_ids(rold->id, rcur->id, idmap); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: @@ -15338,7 +15382,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_id_pair *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap) { int i, spi; @@ -15441,7 +15485,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, } static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { int i; @@ -15489,13 +15533,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - env->idmap_scratch)) + &env->idmap_scratch)) return false; - if (!stacksafe(env, old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch)) return false; - if (!refsafe(old, cur, env->idmap_scratch)) + if (!refsafe(old, cur, &env->idmap_scratch)) return false; return true; @@ -15510,7 +15554,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -15528,7 +15573,7 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (old->active_lock.id && - !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) + !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch)) return false; if (old->active_rcu_lock != cur->active_rcu_lock) -- cgit v1.2.3 From 33457938a0c4e817ab6f9923e244b91289f47f54 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Wed, 14 Jun 2023 01:03:54 +0000 Subject: kallsyms: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. 
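To make the semantic difference concrete, here is a hedged sketch (the editor's illustration, not from this patch; the helper is hypothetical) of what a converted call site relies on:

#include <linux/string.h>

/* Hypothetical helper, for illustration only. */
static void copy_name(char *dst, size_t dst_size, const char *src)
{
	/*
	 * strlcpy(dst, src, dst_size) would return strlen(src), so it must
	 * walk src to its terminating NUL even when src is far longer than
	 * dst_size -- the linear read overflow described above.
	 *
	 * strscpy() reads at most dst_size bytes of src, NUL-terminates
	 * dst, and returns the number of bytes copied or -E2BIG on
	 * truncation. With the return value ignored, as in the sites
	 * converted here, the replacement is mechanical.
	 */
	strscpy(dst, src, dst_size);
}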
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230614010354.1026096-1-azeemshaikh38@gmail.com --- kernel/kallsyms.c | 4 ++-- kernel/params.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 77747391f49b..ddb91d8edaae 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -716,7 +716,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter) { int ret; - strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); + strscpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, @@ -736,7 +736,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter) */ static int get_ksymbol_kprobe(struct kallsym_iter *iter) { - strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); iter->exported = 0; return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, &iter->value, &iter->type, diff --git a/kernel/params.c b/kernel/params.c index 6a7548979aa9..07d01f6ce9a2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -847,7 +847,7 @@ static void __init param_sysfs_builtin(void) name_len = 0; } else { name_len = dot - kp->name + 1; - strlcpy(modname, kp->name, name_len); + strscpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } -- cgit v1.2.3 From 7cc148a32f1e7496e22c0005dd113a31d4a3b3d4 Mon Sep 17 00:00:00 2001 From: James Gowans Date: Thu, 8 Jun 2023 14:00:19 +0200 Subject: genirq: Expand doc for PENDING and REPLAY flags Adding a bit more info about what the flags are used for may help future code readers. Signed-off-by: James Gowans Cc: Thomas Gleixner Cc: Marc Zyngier Cc: Liao Chang Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230608120021.3273400-2-jgowans@amazon.com --- kernel/irq/internals.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5fdc0b557579..c443a0ddc07e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -47,9 +47,12 @@ enum { * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_ONESHOT - irq is not unmasked in primary handler - * IRQS_REPLAY - irq is replayed + * IRQS_REPLAY - irq has been resent and will not be resent + * again until the handler has run and cleared + * this flag. * IRQS_WAITING - irq is waiting - * IRQS_PENDING - irq is pending and replayed later + * IRQS_PENDING - irq needs to be resent and should be resent + * at the next available opportunity. * IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs * IRQS_SYSFS - descriptor has been added to sysfs -- cgit v1.2.3 From 9c15eeb5362c48dd27d51bd72e8873341fa9383c Mon Sep 17 00:00:00 2001 From: James Gowans Date: Thu, 8 Jun 2023 14:00:20 +0200 Subject: genirq: Allow fasteoi handler to resend interrupts on concurrent handling There is a class of interrupt controllers out there that, once they have signalled a given interrupt number, will still signal incoming instances of the *same* interrupt despite the original interrupt not having been EOIed yet. 
As long as the new interrupt reaches the *same* CPU, nothing bad happens, as that CPU still has its interrupts globally disabled, and we will only take the new interrupt once the interrupt has been EOIed. However, things become more "interesting" if an affinity change comes in while the interrupt is being handled. More specifically, while the per-irq lock is being dropped: this results in the affinity change taking place immediately. At this point, there is nothing that prevents the interrupt from firing on the new target CPU. We end up with the interrupt running concurrently on two CPUs, which isn't a good thing. And that's where things become worse: the new CPU notices that the interrupt handling is in progress (irq_may_run() returns false), and *drops the interrupt on the floor*. The whole race looks like this: CPU 0 | CPU 1 -----------------------------|----------------------------- interrupt start | handle_fasteoi_irq | set_affinity(CPU 1) handler | ... | interrupt start ... | handle_fasteoi_irq -> early out handle_fasteoi_irq return | interrupt end interrupt end | If the interrupt was an edge, too bad. The interrupt is lost, and the system will eventually die one way or another. Not great. A way to avoid this situation is to detect this problem at the point we handle the interrupt on the new target. Instead of dropping the interrupt, use the resend mechanism to force it to be replayed. Also, in order to limit the impact of this workaround to the pathetic architectures that require it, gate it behind a new irq flag aptly named IRQD_RESEND_WHEN_IN_PROGRESS. Suggested-by: Marc Zyngier Signed-off-by: James Gowans Cc: Thomas Gleixner Cc: Marc Zyngier Cc: KarimAllah Raslan Cc: Yipeng Zou Cc: Zhang Jianhua [maz: reworded commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230608120021.3273400-3-jgowans@amazon.com --- kernel/irq/chip.c | 16 +++++++++++++++- kernel/irq/debugfs.c | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 49e7bc871fec..57cd8f475302 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -692,8 +692,16 @@ void handle_fasteoi_irq(struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (!irq_may_run(desc)) + /* + * When an affinity change races with IRQ handling, the next interrupt + * can arrive on the new CPU before the original CPU has completed + * handling the previous one - it may need to be resent. + */ + if (!irq_may_run(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; goto out; + } desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); @@ -715,6 +723,12 @@ void handle_fasteoi_irq(struct irq_desc *desc) cond_unmask_eoi_irq(desc, chip); + /* + * When the race described above happens this will resend the interrupt.
+ */ + if (unlikely(desc->istate & IRQS_PENDING)) + check_irq_resend(desc, false); + raw_spin_unlock(&desc->lock); return; out: diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index bbcaac64038e..5971a66be034 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), + + BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS), }; static const struct irq_bit_descr irqdesc_states[] = { -- cgit v1.2.3 From a707df30c9438a9d4d0a43ae7f22b59b078f94c4 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 11 Jun 2023 08:25:35 -0400 Subject: sched/fair: Rename variable cpu_util eff_util cppcheck reports kernel/sched/fair.c:7436:17: style: Local variable 'cpu_util' shadows outer function [shadowFunction] unsigned long cpu_util; ^ Clean this up by renaming the variable to eff_util Signed-off-by: Tom Rix Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20230611122535.183654-1-trix@redhat.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6189d1a45635..7666dbc2b788 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7433,7 +7433,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; unsigned long util = cpu_util(cpu, p, dst_cpu, 1); - unsigned long cpu_util; + unsigned long eff_util; /* * Performance domain frequency: utilization clamping @@ -7442,8 +7442,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util = max(max_util, eff_util); } return min(max_util, eenv->cpu_cap); -- cgit v1.2.3 From 0cce0fde499a92c726cd2e24f7763644f7c9f971 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 3 Jun 2023 15:36:45 +0800 Subject: sched/topology: Mark set_sched_topology() __init All callers of set_sched_topology() are within __init section. Mark it __init too. 
Signed-off-by: Miaohe Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20230603073645.1173332-1-linmiaohe@huawei.com --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ca4472281c28..cb92dc5f5646 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1681,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -void set_sched_topology(struct sched_domain_topology_level *tl) +void __init set_sched_topology(struct sched_domain_topology_level *tl) { if (WARN_ON_ONCE(sched_smp_initialized)) return; -- cgit v1.2.3 From ef73d6a4ef0b35524125c3cfc6deafc26a0c966a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Fri, 2 Jun 2023 21:23:46 +0000 Subject: sched/wait: Fix a kthread_park race with wait_woken() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kthread_park and wait_woken have a similar race that kthread_stop and wait_woken used to have before it was fixed in commit cb6538e740d7 ("sched/wait: Fix a kthread race with wait_woken()"). Extend that fix to also cover kthread_park. [jstultz: Made changes suggested by Peter to optimize memory loads] Signed-off-by: Arve HjønnevÃ¥g Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20230602212350.535358-1-jstultz@google.com --- kernel/kthread.c | 10 ++++++++++ kernel/sched/wait.c | 7 +------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 490792b1066e..07a057086d26 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -182,6 +182,16 @@ bool kthread_should_park(void) } EXPORT_SYMBOL_GPL(kthread_should_park); +bool kthread_should_stop_or_park(void) +{ + struct kthread *kthread = __to_kthread(current); + + if (!kthread) + return false; + + return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); +} + /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 133b74730738..48c53e4739ea 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i } EXPORT_SYMBOL(autoremove_wake_function); -static inline bool is_kthread_should_stop(void) -{ - return (current->flags & PF_KTHREAD) && kthread_should_stop(); -} - /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * @@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) * or woken_wake_function() sees our store to current->state. */ set_current_state(mode); /* A */ - if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From ab5d47bd41b1db82c295b0e751e2b822b43a4b5a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 14 Jun 2023 10:34:30 +0200 Subject: bpf: Remove in_atomic() from bpf_link_put(). bpf_free_inode() is invoked as a RCU callback. 
Usually RCU callbacks are invoked within softirq context. By setting the rcutree.use_softirq=0 boot option, the RCU callbacks will be invoked in a per-CPU kthread with bottom halves disabled, which implies an RCU read section. On PREEMPT_RT the context remains fully preemptible. The RCU read section however does not allow schedule() invocation. The latter happens in mutex_lock() performed by bpf_trampoline_unlink_prog(), originating from bpf_link_put(). It was pointed out that the bpf_link_put() invocation should not be delayed if it originates from close(). It was also pointed out that other invocations from within a syscall should also avoid the workqueue. Everyone else should use the workqueue by default to remain safe in the future (while auditing the code, every caller was preemptible except for the RCU case). Let bpf_link_put() use the worker unconditionally. Add bpf_link_put_direct(), which will directly free the resources and is used by close() and from within __sys_bpf(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230614083430.oENawF8f@linutronix.de --- kernel/bpf/syscall.c | 29 ++++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 92a57efc77de..12955415d376 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2793,28 +2793,31 @@ static void bpf_link_put_deferred(struct work_struct *work) bpf_link_free(link); } -/* bpf_link_put can be called from atomic context, but ensures that resources - * are freed from process context +/* bpf_link_put might be called from atomic context. It needs to be called + * from sleepable context in order to acquire sleeping locks during the process. */ void bpf_link_put(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } EXPORT_SYMBOL(bpf_link_put); +static void bpf_link_put_direct(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + bpf_link_free(link); +} + static int bpf_link_release(struct inode *inode, struct file *filp) { struct bpf_link *link = filp->private_data; - bpf_link_put(link); + bpf_link_put_direct(link); return 0; } @@ -4787,7 +4790,7 @@ out_put_progs: if (ret) bpf_prog_put(new_prog); out_put_link: - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4810,7 +4813,7 @@ static int link_detach(union bpf_attr *attr) else ret = -EOPNOTSUPP; - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4880,7 +4883,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) fd = bpf_link_new_fd(link); if (fd < 0) - bpf_link_put(link); + bpf_link_put_direct(link); return fd; } @@ -4957,7 +4960,7 @@ static int bpf_iter_create(union bpf_attr *attr) return PTR_ERR(link); err = bpf_iter_new_fd(link); - bpf_link_put(link); + bpf_link_put_direct(link); return err; } -- cgit v1.2.3 From 949fa3f11ced2a5c8e3737e73b09676adf4b322b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 15 Jun 2023 03:59:45 -0300 Subject: trace,smp: Add tracepoints around remotely called functions The recently added ipi_send_{cpu,cpumask} tracepoints allow finding sources of IPIs targeting CPUs running latency-sensitive applications.
For NOHZ_FULL CPUs, all IPIs are interference, and those tracepoints are sufficient to find them and work on getting rid of them. In some setups however, not *all* IPIs are to be suppressed, but long-running IPI callbacks can still be problematic. Add a pair of tracepoints to mark the start and end of processing a CSD IPI callback, similar to what exists for softirq, workqueue or timer callbacks. Signed-off-by: Leonardo Bras Tested-and-reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230615065944.188876-5-leobras@redhat.com --- kernel/smp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 71dce748dbf0..1fa01a83fd83 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,6 +27,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" @@ -121,6 +124,14 @@ send_call_function_ipi_mask(struct cpumask *mask) arch_send_call_function_ipi_mask(mask); } +static __always_inline void +csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd) +{ + trace_csd_function_entry(func, csd); + func(info); + trace_csd_function_exit(func, csd); +} + #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); @@ -375,7 +386,7 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd) csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; @@ -477,7 +488,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) } csd_lock_record(csd); - func(info); + csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { @@ -508,7 +519,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) csd_lock_record(csd); csd_unlock(csd); - func(info); + csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); @@ -522,8 +533,10 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) /* * Third; only CSD_TYPE_TTWU is left, issue those. */ - if (entry) - sched_ttwu_pending(entry); + if (entry) { + csd = llist_entry(entry, typeof(*csd), node.llist); + csd_do_func(sched_ttwu_pending, entry, csd); + } } @@ -816,7 +829,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, unsigned long flags; local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); local_irq_restore(flags); } -- cgit v1.2.3 From bf5a8c26ad7caf0772a1cd48c8a0924e48bdbaf0 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 15 Jun 2023 03:59:47 -0300 Subject: trace,smp: Add tracepoints for scheduling remotely called functions Add a tracepoint for when a CSD is queued to a remote CPU's call_single_queue. This allows finding exactly which CPU queued a given CSD when looking at a csd_function_{entry,exit} event, and also enables us to accurately measure IPI delivery time with e.g.
a synthetic event: $ echo 'hist:keys=cpu,csd.hex:ts=common_timestamp.usecs' >\ /sys/kernel/tracing/events/smp/csd_queue_cpu/trigger $ echo 'csd_latency unsigned int dst_cpu; unsigned long csd; u64 time' >\ /sys/kernel/tracing/synthetic_events $ echo \ 'hist:keys=common_cpu,csd.hex:'\ 'time=common_timestamp.usecs-$ts:'\ 'onmatch(smp.csd_queue_cpu).trace(csd_latency,common_cpu,csd,$time)' >\ /sys/kernel/tracing/events/smp/csd_function_entry/trigger $ trace-cmd record -e 'synthetic:csd_latency' hackbench $ trace-cmd report <...>-467 [001] 21.824263: csd_queue_cpu: cpu=0 callsite=try_to_wake_up+0x2ea func=sched_ttwu_pending csd=0xffff8880076148b8 <...>-467 [001] 21.824280: ipi_send_cpu: cpu=0 callsite=try_to_wake_up+0x2ea callback=generic_smp_call_function_single_interrupt+0x0 <...>-489 [000] 21.824299: csd_function_entry: func=sched_ttwu_pending csd=0xffff8880076148b8 <...>-489 [000] 21.824320: csd_latency: dst_cpu=0, csd=18446612682193848504, time=36 Suggested-by: Valentin Schneider Signed-off-by: Leonardo Bras Tested-and-reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230615065944.188876-7-leobras@redhat.com --- kernel/smp.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 1fa01a83fd83..385179dae360 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -340,7 +340,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ - if (trace_ipi_send_cpu_enabled()) { + if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; @@ -348,7 +348,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; - trace_ipi_send_cpu(cpu, _RET_IP_, func); + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* @@ -741,7 +741,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; - int nr_cpus = 0, nr_queued = 0; + int nr_cpus = 0; bool run_remote = false; bool run_local = false; @@ -799,21 +799,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } - nr_queued++; } - /* - * Trace each smp_function_call_*() as an IPI, actual IPIs - * will be traced with func==generic_smp_call_function_single_ipi(). - */ - if (nr_queued) - trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func); - /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the -- cgit v1.2.3 From 6a9d623aad89539eca71eb264db6b9d538620ad5 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Tue, 30 May 2023 09:55:25 -0400 Subject: sched/deadline: Fix bandwidth reclaim equation in GRUB According to the GRUB[1] rule, the runtime is decreased as: "dq = -max{u, (1 - Uinact - Uextra)} dt" (1) To guarantee that deadline tasks don't starve lower class tasks, we do not allocate the full bandwidth of the cpu to deadline tasks. Maximum bandwidth usable by deadline tasks is denoted by "Umax".
Considering Umax, equation (1) becomes: "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt" (2) The current implementation has a minor bug in equation (2), which this patch fixes. The reclamation logic is verified by a sample program which creates multiple deadline threads and observes their utilization. The tests were run on an isolated cpu (isolcpus=3) on a 4 cpu system. Tests on 6.3.0 ============== RUN 1: runtime=7ms, deadline=period=10ms, RT capacity = 95% TID[693]: RECLAIM=1, (r=7ms, d=10ms, p=10ms), Util: 93.33 TID[693]: RECLAIM=1, (r=7ms, d=10ms, p=10ms), Util: 93.35 RUN 2: runtime=1ms, deadline=period=100ms, RT capacity = 95% TID[708]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 16.69 TID[708]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 16.69 RUN 3: 2 tasks Task 1: runtime=1ms, deadline=period=10ms Task 2: runtime=1ms, deadline=period=100ms TID[631]: RECLAIM=1, (r=1ms, d=10ms, p=10ms), Util: 62.67 TID[632]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 6.37 TID[631]: RECLAIM=1, (r=1ms, d=10ms, p=10ms), Util: 62.38 TID[632]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 6.23 As seen above, the reclamation doesn't reclaim the maximum allowed bandwidth and as the bandwidth of tasks gets smaller, the reclaimed bandwidth also comes down. Tests with this patch applied ============================= RUN 1: runtime=7ms, deadline=period=10ms, RT capacity = 95% TID[608]: RECLAIM=1, (r=7ms, d=10ms, p=10ms), Util: 95.19 TID[608]: RECLAIM=1, (r=7ms, d=10ms, p=10ms), Util: 95.16 RUN 2: runtime=1ms, deadline=period=100ms, RT capacity = 95% TID[616]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 95.27 TID[616]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 95.21 RUN 3: 2 tasks Task 1: runtime=1ms, deadline=period=10ms Task 2: runtime=1ms, deadline=period=100ms TID[620]: RECLAIM=1, (r=1ms, d=10ms, p=10ms), Util: 86.64 TID[621]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 8.66 TID[620]: RECLAIM=1, (r=1ms, d=10ms, p=10ms), Util: 86.45 TID[621]: RECLAIM=1, (r=1ms, d=100ms, p=100ms), Util: 8.73 Running tasks on all cpus allowing for migration also showed that the utilization is reclaimed to the maximum. Running 10 tasks with SCHED_FLAG_RECLAIM on 3 cpus - top shows: %Cpu0 : 94.6 us, 0.0 sy, 0.0 ni, 5.4 id, 0.0 wa %Cpu1 : 95.2 us, 0.0 sy, 0.0 ni, 4.8 id, 0.0 wa %Cpu2 : 95.8 us, 0.0 sy, 0.0 ni, 4.2 id, 0.0 wa [1]: Abeni, Luca & Lipari, Giuseppe & Parri, Andrea & Sun, Youcheng. (2015). Parallel and sequential reclaiming in multicore real-time global scheduling. Signed-off-by: Vineeth Pillai (Google) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20230530135526.2385378-1-vineeth@bitbyteword.org --- kernel/sched/deadline.c | 50 +++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 6 ++++++ 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f827067ad03b..e41a36bd66a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1253,43 +1253,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) } /* - * This function implements the GRUB accounting rule:
According to the + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", * where u is the utilization of the task, Umax is the maximum reclaimable * utilization, Uinact is the (per-runqueue) inactive utilization, computed * as the difference between the "total runqueue utilization" and the - * runqueue active utilization, and Uextra is the (per runqueue) extra + * "runqueue active utilization", and Uextra is the (per runqueue) extra * reclaimable utilization. - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by - * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw + * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value should be + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is + * not an issue here. */ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; - u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ /* - * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, - * we compare u_inact + rq->dl.extra_bw with - * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because - * u_inact + rq->dl.extra_bw can be larger than - * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative - * leading to wrong results) + * Instead of computing max{u, (u_max - u_inact - u_extra)}, we + * compare u_inact + u_extra with u_max - u, because u_inact + u_extra + * can be larger than u_max. So, u_max - u_inact - u_extra would be + * negative leading to wrong results. */ - if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) - u_act = u_act_min; + if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw) + u_act = dl_se->dl_bw; else - u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw; + u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT; return (delta * u_act) >> BW_SHIFT; } @@ -2788,12 +2784,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; + dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); + dl_rq->max_bw = dl_rq->extra_bw = + to_ratio(global_rt_period(), global_rt_runtime()); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 556496c77dc2..36e23e49817d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -747,6 +747,12 @@ struct dl_rq { u64 this_bw; u64 extra_bw; + /* + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM + * tasks of this rq. 
Used in calculation of reclaimable bandwidth (GRUB). + */ + u64 max_bw; + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. -- cgit v1.2.3 From cab3ecaed5cdcc9c36a96874b4c45056a46ece45 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 13 Jun 2023 16:20:09 +0800 Subject: sched/core: Fixed missing rq clock update before calling set_rq_offline() When using a cpufreq governor that uses cpufreq_add_update_util_hook(), it is possible to trigger a missing update_rq_clock() warning for the CPU hotplug path: rq_attach_root() set_rq_offline() rq_offline_rt() __disable_runtime() sched_rt_rq_enqueue() enqueue_top_rt_rq() cpufreq_update_util() data->func(data, rq_clock(rq), flags) Move update_rq_clock() from sched_cpu_deactivate() (one of its callers) into set_rq_offline() such that it covers all set_rq_offline() usage. Additionally change rq_attach_root() to use rq_lock_irqsave() so that it will properly manage the runqueue clock flags. Suggested-by: Ben Segall Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20230613082012.49615-2-jiahao.os@bytedance.com --- kernel/sched/core.c | 2 +- kernel/sched/topology.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac38225e6d09..442efe59d188 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9585,6 +9585,7 @@ void set_rq_offline(struct rq *rq) if (rq->online) { const struct sched_class *class; + update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) class->rq_offline(rq); @@ -9726,7 +9727,6 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { - update_rq_clock(rq); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cb92dc5f5646..d3a3b2646ec4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu) void rq_attach_root(struct rq *rq, struct root_domain *rd) { struct root_domain *old_rd = NULL; - unsigned long flags; + struct rq_flags rf; - raw_spin_rq_lock_irqsave(rq, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { old_rd = rq->rd; @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_rq_unlock_irqrestore(rq, flags); + rq_unlock_irqrestore(rq, &rf); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); -- cgit v1.2.3 From 96500560f0c73c71bca1b27536c6254fa0e8ce37 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 13 Jun 2023 16:20:10 +0800 Subject: sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop() There is a double update_rq_clock() invocation: __balance_push_cpu_stop() update_rq_clock() __migrate_task() update_rq_clock() Sadly select_fallback_rq() also needs update_rq_clock() for __do_set_cpus_allowed(), so it is not possible to remove the update from __balance_push_cpu_stop(). So remove it from __migrate_task() and ensure all callers of this function call update_rq_clock() prior to calling it.
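That contract - the callee no longer touches the clock, every caller refreshes it first - can be sketched in a few lines of plain userspace C. The struct, the flag and the assert below are illustrative stand-ins for the scheduler's rq state and clock-update warnings, not the kernel implementation:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct rq { bool clock_updated; unsigned long clock; };

static void update_rq_clock(struct rq *rq)
{
	rq->clock++;              /* stand-in for reading the real clock */
	rq->clock_updated = true;
}

static void __migrate_task(struct rq *rq)
{
	/* Precondition after this patch: the caller updated the clock. */
	assert(rq->clock_updated);
	printf("migrating at clock %lu\n", rq->clock);
}

int main(void)
{
	struct rq rq = { false, 0 };

	update_rq_clock(&rq);     /* every caller now does this first... */
	__migrate_task(&rq);      /* ...so the callee never updates twice */
	return 0;
}

Dropping the update from the callee removes the double invocation, while the asserted precondition makes a forgetful caller fail loudly instead of silently accounting against a stale clock.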
Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20230613082012.49615-3-jiahao.os@bytedance.com --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 442efe59d188..c7db597e8175 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2546,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, if (!is_cpu_allowed(p, dest_cpu)) return rq; - update_rq_clock(rq); rq = move_queued_task(rq, rf, p, dest_cpu); return rq; @@ -2604,10 +2603,12 @@ static int migration_cpu_stop(void *data) goto out; } - if (task_on_rq_queued(p)) + if (task_on_rq_queued(p)) { + update_rq_clock(rq); rq = __migrate_task(rq, &rf, p, arg->dest_cpu); - else + } else { p->wake_cpu = arg->dest_cpu; + } /* * XXX __migrate_task() can fail, at which point we might end -- cgit v1.2.3 From ebb83d84e49b54369b0db67136a5fe1087124dcc Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 13 Jun 2023 16:20:11 +0800 Subject: sched/core: Avoid multiple calls to update_rq_clock() in __cfsb_csd_unthrottle() After commit 8ad075c2eb1f ("sched: Async unthrottling for cfs bandwidth"), we may update the rq clock multiple times in the loop of __cfsb_csd_unthrottle(). A prior (although less common) instance of this problem exists in unthrottle_offline_cfs_rqs(). Cure both by ensuring update_rq_clock() is called before the loop and setting RQCF_ACT_SKIP during the loop, to suppress further updates. The alternative would be pulling update_rq_clock() out of unthrottle_cfs_rq(), but that gives an even bigger mess. Fixes: 8ad075c2eb1f ("sched: Async unthrottling for cfs bandwidth") Reviewed-By: Ben Segall Suggested-by: Vincent Guittot Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20230613082012.49615-4-jiahao.os@bytedance.com --- kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/sched.h | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7666dbc2b788..a80a73909dc2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5576,6 +5576,14 @@ static void __cfsb_csd_unthrottle(void *arg) rq_lock(rq, &rf); + /* + * Iterating over the list can trigger several calls to + * update_rq_clock() in unthrottle_cfs_rq(). + * Do it once and skip the potential next ones. + */ + update_rq_clock(rq); + rq_clock_start_loop_update(rq); + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the @@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg) rcu_read_unlock(); + rq_clock_stop_loop_update(rq); rq_unlock(rq, &rf); } @@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + /* + * The rq clock has already been updated in the + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq().
+ */ + rq_clock_start_loop_update(rq); + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); } #else /* CONFIG_CFS_BANDWIDTH */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 36e23e49817d..50d4b61aef3a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) rq->clock_update_flags &= ~RQCF_REQ_SKIP; } +/* + * During cpu offlining and rq wide unthrottling, we can trigger + * an update_rq_clock() for several cfs and rt runqueues (Typically + * when using list_for_each_entry_*) + * rq_clock_start_loop_update() can be called after updating the clock + * once and before iterating over the list to prevent multiple update. + * After the iterative traversal, we need to call rq_clock_stop_loop_update() + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. + */ +static inline void rq_clock_start_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + rq->clock_update_flags |= RQCF_ACT_SKIP; +} + +static inline void rq_clock_stop_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_ACT_SKIP; +} + struct rq_flags { unsigned long flags; struct pin_cookie cookie; -- cgit v1.2.3 From 8091f56ee4e51037662590edc0b0e44807fbab4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 22:04:28 +0200 Subject: irqdomain: Include internals.h for function prototypes irq_domain_debugfs_init() is defined in irqdomain.c, but the declaration is in a header that is not included here: kernel/irq/irqdomain.c:1965:13: error: no previous prototype for 'irq_domain_debugfs_init' [-Werror=missing-prototypes] Signed-off-by: Arnd Bergmann Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230516200432.554240-1-arnd@kernel.org --- kernel/irq/irqdomain.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f34760a1e222..5bd01624e447 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1915,6 +1915,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS +#include "internals.h" + static struct dentry *domain_dir; static void -- cgit v1.2.3 From 9d9e522010eb5685d8b53e8a24320653d9d4cbbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Jun 2023 22:16:34 +0200 Subject: posix-timers: Prevent RT livelock in itimer_delete() itimer_delete() has a retry loop when the timer is concurrently expired. On non-RT kernels this just spin-waits until the timer callback has completed, except for posix CPU timers which have HAVE_POSIX_CPU_TIMERS_TASK_WORK enabled. In that case and on RT kernels the existing task could live lock when preempting the task which does the timer delivery. Replace spin_unlock() with an invocation of timer_wait_running() to handle it the same way as the other retry loops in the posix timer code. 
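The shape of that fix can be illustrated with a self-contained userspace sketch in which a condition variable plays the role of timer_wait_running(); every name here is a stand-in chosen to mirror the changelog, not the kernel implementation:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t timer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t expiry_done = PTHREAD_COND_INITIALIZER;
static bool callback_running = true;

static void *expiry_thread(void *arg)
{
	(void)arg;
	usleep(10000);            /* the expiry callback does its work */
	pthread_mutex_lock(&timer_lock);
	callback_running = false;
	pthread_cond_signal(&expiry_done);
	pthread_mutex_unlock(&timer_lock);
	return NULL;
}

static void itimer_delete_sketch(void)
{
	pthread_mutex_lock(&timer_lock);
	/* The TIMER_RETRY case: wait for the expiry instead of spinning. */
	while (callback_running)
		pthread_cond_wait(&expiry_done, &timer_lock); /* drops the lock */
	pthread_mutex_unlock(&timer_lock);
	puts("timer deleted after the expiry completed");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, expiry_thread, NULL);
	itimer_delete_sketch();
	pthread_join(t, NULL);
	return 0;
}

The essential property is that the wait drops the lock while sleeping, so the expiry side can finish; the deleter can no longer livelock it by spinning with the lock held.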
Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT") Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87v8g7c50d.ffs@tglx --- kernel/time/posix-timers.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 808a247205a9..ed3c4a954398 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1037,27 +1037,52 @@ retry_delete: } /* - * return timer owned by the process, used by exit_itimers + * Delete a timer if it is armed, remove it from the hash and schedule it + * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { -retry_delete: - spin_lock_irq(&timer->it_lock); + unsigned long flags; + + /* + * irqsave is required to make timer_wait_running() work. + */ + spin_lock_irqsave(&timer->it_lock, flags); +retry_delete: + /* + * Even if the timer is no longer accessible from other tasks + * it still might be armed and queued in the underlying timer + * mechanism. Worse, that timer mechanism might run the expiry + * function concurrently. + */ if (timer_delete_hook(timer) == TIMER_RETRY) { - spin_unlock_irq(&timer->it_lock); + /* + * Timer is expired concurrently, prevent livelocks + * and pointless spinning on RT. + * + * timer_wait_running() drops timer::it_lock, which opens + * the possibility for another task to delete the timer. + * + * That's not possible here because this is invoked from + * do_exit() only for the last thread of the thread group. + * So no other task can access and delete that timer. + */ + if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) + return; + goto retry_delete; } list_del(&timer->list); - spin_unlock_irq(&timer->it_lock); + spin_unlock_irqrestore(&timer->it_lock, flags); release_posix_timer(timer, IT_ID_SET); } /* - * This is called by do_exit or de_thread, only when nobody else can - * modify the signal->posix_timers list. Yet we need sighand->siglock - * to prevent the race with /proc/pid/timers. + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. */ void exit_itimers(struct task_struct *tsk) { @@ -1067,10 +1092,12 @@ void exit_itimers(struct task_struct *tsk) if (list_empty(&tsk->signal->posix_timers)) return; + /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); list_replace_init(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); + /* The timers are no longer accessible via tsk::signal */ while (!list_empty(&timers)) { tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); -- cgit v1.2.3 From 8ce8849dd1e78dadcee0ec9acbd259d239b7069f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Jun 2023 20:58:47 +0200 Subject: posix-timers: Ensure timer ID search-loop limit is valid posix_timer_add() tries to allocate a posix timer ID by starting from the cached ID which was stored by the last successful allocation. This is done in a loop searching the ID space for a free slot one by one. The loop has to terminate when the search wrapped around to the starting point. But that's racy vs. establishing the starting point. That starting point is read out locklessly, which leads to the following problem:

CPU0                                   CPU1
posix_timer_add()
    start = sig->posix_timer_id;
    lock(hash_lock);
    ...                                posix_timer_add()
    if (++sig->posix_timer_id < 0)
                                           start = sig->posix_timer_id;
        sig->posix_timer_id = 0;

So CPU1 can observe a negative start value, i.e. -1, and the loop break never happens because the condition can never be true: if (sig->posix_timer_id == start) break; While this is unlikely to ever turn into an endless loop as the ID space is huge (INT_MAX), the racy read of the start value caught the attention of KCSAN and Dmitry unearthed that incorrectness. Rewrite it so that all id operations are under the hash lock. Reported-by: syzbot+5c54bd3eb218bb595aa9@syzkaller.appspotmail.com Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87bkhzdn6g.ffs@tglx --- kernel/time/posix-timers.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ed3c4a954398..2d6cf93ca370 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -140,25 +140,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id) static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; struct hlist_head *head; - int ret = -ENOENT; + unsigned int cnt, id; - do { + /* + * FIXME: Replace this by a per signal struct xarray once there is + * a plan to handle the resulting CRIU regression gracefully. + */ + for (cnt = 0; cnt <= INT_MAX; cnt++) { spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + id = sig->next_posix_timer_id; + + /* Write the next ID back. Clamp it to the positive space */ + sig->next_posix_timer_id = (id + 1) & INT_MAX; + + head = &posix_timers_hashtable[hash(sig, id)]; + if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; + spin_unlock(&hash_lock); + return id; } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+ * On PREEMPT_RT this prevents priority inversion and a potential livelock + * against the ksoftirqd thread in case that ksoftirqd gets preempted while + * executing a hrtimer callback. + * + * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this + * just results in a cpu_relax(). + * + * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is + * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this + * prevents spinning on an eventually scheduled out task and a livelock + * when the task which tries to delete or disarm the timer has preempted + * the task which runs the expiry in task work context. */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) -- cgit v1.2.3 From 8d44b958a1a088195524c884f0437660e0522380 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:01 +0200 Subject: posix-timers: Cleanup comments about timer ID tracking Describe the hash table properly and remove the IDR leftover comments. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.038444551@linutronix.de --- kernel/time/posix-timers.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0c61f29ef5d5..79909a2ac3e4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -35,20 +35,17 @@ #include "timekeeping.h" #include "posix-timers.h" -/* - * Management arrays for POSIX timers. Timers are now kept in static hash table - * with 512 entries. - * Timer ids are allocated by local routine, which selects proper hash head by - * key, constructed from current->signal address and per signal struct counter. - * This keeps timer ids unique per process, but now they can intersect between - * processes. - */ +static struct kmem_cache *posix_timers_cache; /* - * Lets keep our timers in a slab cache :-) + * Timers are managed in a hash table for lockless lookup. The hash key is + * constructed from current::signal and the timer ID and the timer is + * matched against current::signal and the timer ID when walking the hash + * bucket list. + * + * This allows checkpoint/restore to reconstruct the exact timer IDs for + * a process. */ -static struct kmem_cache *posix_timers_cache; - static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); @@ -65,15 +62,6 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various -- cgit v1.2.3 From ae88967d71f1b4ffb6e48043993d37a106da8109 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:03 +0200 Subject: posix-timers: Add comments about timer lookup Document how the timer ID validation in the hash table works. 
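The pattern those comments describe - a lockless lookup whose result is only trusted after revalidating ownership under the per-timer lock - is generic enough to sketch with plain pthreads. The mutex below stands in for both the RCU protection and it_lock, and all names are illustrative:

#include <pthread.h>
#include <stddef.h>

struct timer_obj {
	pthread_mutex_t lock;
	void *owner;              /* cleared first when deletion starts */
};

static struct timer_obj timer_table[1] = {
	{ PTHREAD_MUTEX_INITIALIZER, NULL }
};

/* Lockless stage: may hand back an object that is concurrently dying. */
static struct timer_obj *lookup_by_id(unsigned int id)
{
	return id == 0 ? &timer_table[0] : NULL;
}

/* Locked stage: trust the result only after rechecking ownership. */
static struct timer_obj *lock_timer(unsigned int id, void *self)
{
	struct timer_obj *t = lookup_by_id(id);

	if (!t)
		return NULL;

	pthread_mutex_lock(&t->lock);
	if (t->owner == self)
		return t;         /* still valid, returned locked */

	/* Deletion won the race: owner was cleared before unhashing. */
	pthread_mutex_unlock(&t->lock);
	return NULL;
}

int main(void)
{
	int me;
	struct timer_obj *t;

	timer_table[0].owner = &me;   /* "timer_create": timer becomes valid */
	t = lock_timer(0, &me);
	if (t)
		pthread_mutex_unlock(&t->lock);
	return 0;
}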
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.091081515@linutronix.de --- kernel/time/posix-timers.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 79909a2ac3e4..d7890ac703ae 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -506,6 +506,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EAGAIN; spin_lock_init(&new_timer->it_lock); + + /* + * Add the timer to the hash table. The timer is not yet valid + * because new_timer::it_signal is still NULL. The timer id is also + * not yet visible to user space. + */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { error = new_timer_id; @@ -551,6 +557,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; spin_lock_irq(&current->sighand->siglock); + /* This makes the timer valid in the hash table */ new_timer->it_signal = current->signal; list_add(&new_timer->list, &current->signal->posix_timers); spin_unlock_irq(&current->sighand->siglock); @@ -597,13 +604,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; @@ -615,10 +615,35 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) if ((unsigned long long)timer_id > INT_MAX) return NULL; + /* + * The hash lookup and the timers are RCU protected. + * + * Timers are added to the hash in invalid state where + * timr::it_signal == NULL. timer::it_signal is only set after the + * rest of the initialization succeeded. + * + * Timer destruction happens in steps: + * 1) Set timr::it_signal to NULL with timr::it_lock held + * 2) Release timr::it_lock + * 3) Remove from the hash under hash_lock + * 4) Call RCU for removal after the grace period + * + * Holding rcu_read_lock() across the lookup ensures that + * the timer cannot be freed. + * + * The lookup validates locklessly that timr::it_signal == + * current::signal and timr::it_id == @timer_id. timr::it_id + * can't change, but timr::it_signal becomes NULL during + * destruction. + */ rcu_read_lock(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); + /* + * Validate under timr::it_lock that timr::it_signal is + * still valid. Pairs with #1 above. + */ if (timr->it_signal == current->signal) { rcu_read_unlock(); return timr; -- cgit v1.2.3 From 028cf5eaa12846c4e32104132ff70ca1cd6f5943 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:05 +0200 Subject: posix-timers: Annotate concurrent access to k_itimer::it_signal k_itimer::it_signal is read locklessly in the RCU protected hash lookup, but it can be written concurrently in the timer_create() and timer_delete() path.
Annotate these places with READ_ONCE() and WRITE_ONCE(). Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.143596887@linutronix.de --- kernel/time/posix-timers.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d7890ac703ae..de3fca8dae55 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -109,9 +109,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, - lockdep_is_held(&hash_lock)) { - if ((timer->it_signal == sig) && (timer->it_id == id)) + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + /* timer->it_signal can be set concurrently */ + if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; @@ -558,7 +558,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_irq(&current->sighand->siglock); /* This makes the timer valid in the hash table */ - new_timer->it_signal = current->signal; + WRITE_ONCE(new_timer->it_signal, current->signal); list_add(&new_timer->list, &current->signal->posix_timers); spin_unlock_irq(&current->sighand->siglock); @@ -1052,10 +1052,10 @@ retry_delete: list_del(&timer->list); spin_unlock(&current->sighand->siglock); /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). + * A concurrent lookup could check timer::it_signal locklessly. It + * will reevaluate with timer::it_lock held and observe the NULL. */ - timer->it_signal = NULL; + WRITE_ONCE(timer->it_signal, NULL); unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); -- cgit v1.2.3 From 72786ff23d5acb7bf3e2535831b2f1dc55c7f44e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:06 +0200 Subject: posix-timers: Set k_itimer::it_signal to NULL on exit() Technically it's not required to set k_itimer::it_signal to NULL on exit() because there is no other thread anymore which could lookup the timer concurrently. Set it to NULL for consistency's sake and add a comment to that effect. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.196462644@linutronix.de --- kernel/time/posix-timers.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index de3fca8dae55..c1b77c597f5f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1101,6 +1101,14 @@ retry_delete: } list_del(&timer->list); + /* + * Setting timer::it_signal to NULL is technically not required + * here as nothing can access the timer anymore legitimately via + * the hash table. Set it to NULL nevertheless so that all deletion + * paths are consistent. + */ + WRITE_ONCE(timer->it_signal, NULL); + spin_unlock_irqrestore(&timer->it_lock, flags); release_posix_timer(timer, IT_ID_SET); } -- cgit v1.2.3 From 11fbe6cd41210c7b5173257158a22e11e225622d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:08 +0200 Subject: posix-timers: Remove pointless irqsafe from hash_lock All usage of hash_lock is in thread context. No point in using spin_lock_irqsave()/irqrestore() for a single usage site.
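A userspace analogy may help here, with signals playing the role of interrupts: the irqsave variant is only needed when a lock can also be taken from interrupt context, which hash_lock never is. The helpers below are illustrative, not kernel API:

#include <pthread.h>
#include <signal.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* spin_lock_irqsave() analogue: block "interrupts" (signals) first,
 * required when the same lock can be taken from a signal handler. */
static void locked_update_irqsafe(int *val)
{
	sigset_t all, old;

	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, &old);   /* local_irq_save() */
	pthread_mutex_lock(&lock);
	(*val)++;
	pthread_mutex_unlock(&lock);
	pthread_sigmask(SIG_SETMASK, &old, NULL); /* local_irq_restore() */
}

/* spin_lock() analogue: sufficient when, like hash_lock, the lock is
 * only ever taken from thread context. */
static void locked_update(int *val)
{
	pthread_mutex_lock(&lock);
	(*val)++;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int v = 0;

	locked_update_irqsafe(&v);
	locked_update(&v);
	printf("v = %d\n", v);
	return 0;
}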
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.249063953@linutronix.de --- kernel/time/posix-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c1b77c597f5f..ed7d260f51ef 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -471,10 +471,9 @@ static void k_itimer_rcu_free(struct rcu_head *head) static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&hash_lock, flags); + spin_lock(&hash_lock); hlist_del_rcu(&tmr->t_hash); - spin_unlock_irqrestore(&hash_lock, flags); + spin_unlock(&hash_lock); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); -- cgit v1.2.3 From 8cc96ca2c75f6da59de41321797c87562703c9e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:09 +0200 Subject: posix-timers: Split release_posix_timer() release_posix_timer() is called for cleaning up both hashed and unhashed timers. The cases are differentiated by an argument and the usage is hideous. Separate the actual free path out and use it for unhashed timers. Provide a function for hashed timers. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.301432503@linutronix.de --- kernel/time/posix-timers.c | 31 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ed7d260f51ef..6ac693331df9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -466,20 +466,21 @@ static void k_itimer_rcu_free(struct rcu_head *head) kmem_cache_free(posix_timers_cache, tmr); } -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) -{ - if (it_id_set) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); - } +static void posix_timer_free(struct k_itimer *tmr) +{ put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); call_rcu(&tmr->rcu, k_itimer_rcu_free); } +static void posix_timer_unhash_and_free(struct k_itimer *tmr) +{ + spin_lock(&hash_lock); + hlist_del_rcu(&tmr->t_hash); + spin_unlock(&hash_lock); + posix_timer_free(tmr); +} + static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -493,7 +494,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - int it_id_set = IT_ID_NOT_SET; if (!kc) return -EINVAL; @@ -513,11 +513,10 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { - error = new_timer_id; - goto out; + posix_timer_free(new_timer); + return new_timer_id; } - it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; @@ -569,7 +568,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, * new_timer after the unlock call.
*/ out: - release_posix_timer(new_timer, it_id_set); + posix_timer_unhash_and_free(new_timer); return error; } @@ -1057,7 +1056,7 @@ retry_delete: WRITE_ONCE(timer->it_signal, NULL); unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); + posix_timer_unhash_and_free(timer); return 0; } @@ -1109,7 +1108,7 @@ retry_delete: WRITE_ONCE(timer->it_signal, NULL); spin_unlock_irqrestore(&timer->it_lock, flags); - release_posix_timer(timer, IT_ID_SET); + posix_timer_unhash_and_free(timer); } /* -- cgit v1.2.3 From 01679b5db7172b898be325ff272e10aebd412911 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:11 +0200 Subject: posix-timers: Document sys_clock_getres() correctly The decades old comment about Posix clock resolution is confusing at best. Remove it and add a proper explanation to sys_clock_getres(). Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.356427330@linutronix.de --- kernel/time/posix-timers.c | 81 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 6ac693331df9..1acdd04e5d26 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -67,14 +67,6 @@ static const struct k_clock clock_realtime, clock_monotonic; * to implement others. This structure defines the various * clocks. * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * * FUNCTIONS: The CLOCKs structure defines possible functions to * handle various clock functions. * @@ -1198,6 +1190,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +/** + * sys_clock_getres - Get the resolution of a clock + * @which_clock: The clock to get the resolution for + * @tp: Pointer to a user space timespec64 for storage + * + * POSIX defines: + * + * "The clock_getres() function shall return the resolution of any + * clock. Clock resolutions are implementation-defined and cannot be set by + * a process. If the argument res is not NULL, the resolution of the + * specified clock shall be stored in the location pointed to by res. If + * res is NULL, the clock resolution is not returned. If the time argument + * of clock_settime() is not a multiple of res, then the value is truncated + * to a multiple of res." + * + * Due to the various hardware constraints the real resolution can vary + * wildly and even change during runtime when the underlying devices are + * replaced. The kernel also can use hardware devices with different + * resolutions for reading the time and for arming timers.
+ * + * The kernel therefore deviates from the POSIX spec in various aspects: + * + * 1) The resolution returned to user space + * + * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, + * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM and CLOCK_MONOTONIC_RAW + * the kernel differentiates only two cases: + * + * I) Low resolution mode: + * + * When high resolution timers are disabled at compile or runtime + * the resolution returned is nanoseconds per tick, which represents + * the precision at which timers expire. + * + * II) High resolution mode: + * + * When high resolution timers are enabled the resolution returned + * is always one nanosecond independent of the actual resolution of + * the underlying hardware devices. + * + * For CLOCK_*_ALARM the actual resolution depends on system + * state. When the system is running the resolution is the same as the + * resolution of the other clocks. During suspend the actual + * resolution is the resolution of the underlying RTC device which + * might be way less precise than the clockevent device used during + * running state. + * + * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution + * returned is always nanoseconds per tick. + * + * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution + * returned is always one nanosecond under the assumption that the + * underlying scheduler clock has a better resolution than nanoseconds + * per tick. + * + * For dynamic POSIX clocks (PTP devices) the resolution returned is + * always one nanosecond. + * + * 2) Effect on sys_clock_settime() + * + * The kernel does not truncate the time which is handed in to + * sys_clock_settime(). The kernel internal timekeeping is always using + * nanoseconds precision independent of the clocksource device which is + * used to read the time from. The resolution of that device only + * affects the precision of the time returned by sys_clock_gettime(). + * + * Returns: + * 0 Success. @tp contains the resolution + * -EINVAL @which_clock is not a valid clock ID + * -EFAULT Copying the resolution to @tp faulted + * -ENODEV Dynamic POSIX clock is not backed by a device + * -EOPNOTSUPP Dynamic POSIX clock does not support getres() + */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { -- cgit v1.2.3 From a86e9284336729a8f79a65371eacd0c1c7fae142 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:12 +0200 Subject: posix-timers: Document common_clock_get() correctly Replace another confusing and inaccurate set of comments. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.409169321@linutronix.de --- kernel/time/posix-timers.c | 50 +++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1acdd04e5d26..e1af74ca80ab 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -660,20 +660,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) } /* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. + * Get the time remaining on a POSIX.1b interval timer. * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending.
These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. + * Two issues to handle here: * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. + * 1) The timer has a requeue pending. The return value must appear as + * if the timer has been requeued right now. + * + * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued + * into the hrtimer queue and therefore never expired. Emulate expiry + * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { @@ -689,8 +685,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) cur_setting->it_interval = ktime_to_timespec64(iv); } else if (!timr->it_active) { /* - * SIGEV_NONE oneshot timers are never queued. Check them - * below. + * SIGEV_NONE oneshot timers are never queued and therefore + * timr->it_active is always false. The check below + * vs. remaining time will handle this case. + * + * For all other timers there is nothing to update here, so + * return. */ if (!sig_none) return; @@ -699,18 +699,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) now = kc->clock_get_ktime(timr->it_clock); /* - * When a requeue is pending or this is a SIGEV_NONE timer move the - * expiry time forward by intervals, so expiry is > now. + * If this is an interval timer and either has requeue pending or + * is a SIGEV_NONE timer move the expiry time forward by intervals, + * so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); - /* Return 0 only, when the timer is expired and not pending */ + /* + * As @now is retrieved before a possible timer_forward() and + * cannot be reevaluated by the compiler @remaining is based on the + * same @now value. Therefore @remaining is consistent vs. @now. + * + * Consequently all interval timers, i.e. @iv > 0, cannot have a + * remaining time <= 0 because timer_forward() guarantees to move + * them forward so that the next timer expiry is > @now. + */ if (remaining <= 0) { /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; @@ -719,7 +730,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } } -/* Get the time remaining on a POSIX.1b interval timer. */ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { struct k_itimer *timr; -- cgit v1.2.3 From 65cade468dee537885b51897ba41ba05a4dcf6ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:14 +0200 Subject: posix-timers: Document sys_clock_getoverrun() Document the syscall in detail and with coherent sentences. 
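The documented behaviour is easy to observe from user space with the glibc wrappers (a minimal probe, error handling omitted; older glibc needs -lrt at link time): arm a 1 ms periodic timer, leave its signal blocked for a second so expiries accumulate, then dequeue one signal and read the overrun count that belongs to it:

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	timer_t id;
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
				.sigev_signo = SIGRTMIN };
	struct itimerspec its = {
		.it_value = { .tv_sec = 0, .tv_nsec = 1000000 },
		.it_interval = { .tv_sec = 0, .tv_nsec = 1000000 },
	};
	sigset_t set;
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &set, NULL);

	timer_create(CLOCK_MONOTONIC, &sev, &id);
	timer_settime(id, 0, &its, NULL);

	sleep(1);              /* expiries pile up, signal stays queued */
	sigwait(&set, &sig);   /* deliver the one queued timer signal */

	/* Overrun count frozen at delivery of that signal */
	printf("overruns: %d\n", timer_getoverrun(id));

	timer_delete(id);
	return 0;
}

With a 1 ms interval and roughly a second of blocked expiries the printed count lands near a thousand, illustrating that the value is relative to the last delivered signal rather than a live counter.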
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.462051641@linutronix.de --- kernel/time/posix-timers.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e1af74ca80ab..191ecf59cb98 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -783,14 +783,23 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, #endif -/* - * Get the number of overruns of a POSIX.1b interval timer. This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to posixtimer_rearm(). So all we need to do is - * to pick up the frozen overrun. +/** + * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer + * @timer_id: The timer ID which identifies the timer + * + * The "overrun count" of a timer is one plus the number of expiration + * intervals which have elapsed between the first expiry, which queues the + * signal and the actual signal delivery. On signal delivery the "overrun + * count" is calculated and cached, so it can be returned directly here. + * + * As this is relative to the last queued signal the returned overrun count + * is meaningless outside of the signal delivery path and even there it + * does not accurately reflect the current state when user space evaluates + * it. + * + * Returns: + * -EINVAL @timer_id is invalid + * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { -- cgit v1.2.3 From 3561fcb402b7ab7fdb4c1746dae4995889506605 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:16 +0200 Subject: posix-timers: Document sys_clock_settime() permissions in place The documentation of sys_clock_settime() permissions is at a random place and mostly word salad. Remove it and add a concise comment into sys_clock_settime(). Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.514700292@linutronix.de --- kernel/time/posix-timers.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 191ecf59cb98..03ef6af1633b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -74,13 +74,6 @@ static const struct k_clock clock_realtime, clock_monotonic; * following: 1.) The k_itimer struct (sched.h) is used for * the timer. 2.) The list, it_lock, it_clock, it_id and * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). 
*/ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); @@ -1159,6 +1152,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (get_timespec64(&new_tp, tp)) return -EFAULT; + /* + * Permission checks have to be done inside the clock specific + * setter callback. + */ return kc->clock_set(which_clock, &new_tp); } -- cgit v1.2.3 From 640fe745d7d4b6e47f3b455cb5de99a08c6b6d23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:17 +0200 Subject: posix-timers: Document nanosleep() details The description of common_nsleep() is wrong and common_nsleep_timens() lacks any form of comment. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.567072835@linutronix.de --- kernel/time/posix-timers.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 03ef6af1633b..54adb4cfd4f2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1370,7 +1370,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, #endif /* - * nanosleep for monotonic and realtime clocks + * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) @@ -1382,8 +1382,13 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +/* + * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME + * + * Absolute nanosleeps for these clocks are time-namespace adjusted. + */ static int common_nsleep_timens(const clockid_t which_clock, int flags, - const struct timespec64 *rqtp) + const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); -- cgit v1.2.3 From 52f090b164b59c06a4da87c0599424809a6bba16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:19 +0200 Subject: posix-timers: Add proper comments in do_timer_create() The comment about timer lifetime at the end of the function is misplaced and incomprehensible. Make it understandable and put it at the right place. Add a new comment about the visibility of the new timer ID to user space. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.619897296@linutronix.de --- kernel/time/posix-timers.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 54adb4cfd4f2..20d3b991ec49 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -529,12 +529,17 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } - + /* + * After successful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal is still NULL. + * + * Complete the initialization with the clock specific create + * callback.
+ */ error = kc->timer_create(new_timer); if (error) goto out; @@ -544,14 +549,11 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, WRITE_ONCE(new_timer->it_signal, current->signal); list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); - - return 0; /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. + * After unlocking sighand::siglock @new_timer is subject to + * concurrent removal and cannot be touched anymore */ + return 0; out: posix_timer_unhash_and_free(new_timer); return error; -- cgit v1.2.3 From c575689d66378611bb04137b96e67ebb3ac8482b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:20 +0200 Subject: posix-timers: Comment SIGEV_THREAD_ID properly Replace the word salad. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.672220780@linutronix.de --- kernel/time/posix-timers.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 20d3b991ec49..c8b0f52e5eb6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -53,12 +53,9 @@ static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ +/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -- cgit v1.2.3 From 02972d7955341b711d4c392f14faa9f9cd69d551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:22 +0200 Subject: posix-timers: Clarify posix_timer_rearm() comment Yet another incomprehensible piece of art. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.724863461@linutronix.de --- kernel/time/posix-timers.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c8b0f52e5eb6..d8d2169408f4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -275,15 +275,9 @@ static void common_hrtimer_rearm(struct k_itimer *timr) } /* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. + * This function is called from the signal delivery code if + * info->si_sys_private is not zero, which indicates that the timer has to + * be rearmed. Restart the timer and update info::si_overrun. 
*/ void posixtimer_rearm(struct kernel_siginfo *info) { -- cgit v1.2.3 From 84999b8bdb4969816c7bb7c14c3a55ed42aa4b94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Jun 2023 21:07:37 +0200 Subject: posix-timers: Clarify posix_timer_fn() comments Make the issues vs. SIG_IGN understandable and remove the 15-year-old promise that a proper solution is already on the horizon. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/874jnrdmrq.ffs@tglx --- kernel/time/posix-timers.c | 62 ++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d8d2169408f4..ae8799e97fcd 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -326,11 +326,11 @@ int posix_timer_event(struct k_itimer *timr, int si_private) } /* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + * This function gets called when a POSIX.1b interval timer expires from + * the HRTIMER interrupt (soft interrupt on RT kernels). + * + * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI + * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { @@ -348,9 +348,10 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) if (posix_timer_event(timr, si_private)) { /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. + * The signal was not queued due to SIG_IGN. As a + * consequence the timer is not going to be rearmed from + * the signal delivery path. But as a real signal handler + * can be installed later the timer must be rearmed here. */ if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* * FIXME: What we really want, is to stop this * timer completely and restart it in case the * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. + * change to the signal handling code. + * + * For now let timers with an interval less than a + * jiffie expire every jiffie and recheck for a + * valid signal handler. + * + * This avoids interrupt starvation in case of a + * very small interval, which would expire the + * timer immediately again. * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. + * Moving now ahead of time by one jiffie tricks + * hrtimer_forward() to expire the timer later, + * while it still maintains the overrun accuracy + * for the price of a slight inconsistency in the + * timer_gettime() case. This is at least better + * than a timer storm.
+ * + * Only required when high resolution timers are + * enabled as the periodic tick based timers are + * automatically aligned to the next tick. */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = NSEC_PER_SEC / HZ; + if (IS_ENABLED(CONFIG_HIGHRES_TIMERS)) { + ktime_t kj = TICK_NSEC; if (timr->it_interval < kj) now = ktime_add(now, kj); } -#endif - timr->it_overrun += hrtimer_forward(timer, now, - timr->it_interval); + + timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; -- cgit v1.2.3 From 200dbd6d14e6a3b22162d78b4f7a253426dba7ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:25 +0200 Subject: posix-timers: Remove pointless comments Documenting the obvious is just consuming space for no value. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.832240451@linutronix.de --- kernel/time/posix-timers.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ae8799e97fcd..d357728b7f00 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -59,19 +59,6 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. 
- */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -141,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* Get clock_realtime */ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); @@ -153,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock) return ktime_get_real(); } -/* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { @@ -166,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, return do_adjtimex(t); } -/* - * Get monotonic time for posix timers - */ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); @@ -181,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) return ktime_get(); } -/* - * Get monotonic-raw time for posix timers - */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); @@ -191,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) return 0; } - static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); @@ -242,9 +220,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", -- cgit v1.2.3 From b96ce4931fcd13d73e32c62c2df3fa8f9f467e33 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Apr 2023 20:49:27 +0200 Subject: posix-timers: Polish coding style in a few places Make it consistent with the TIP tree documentation. 
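For illustration only, a minimal sketch of the convention being applied (assuming the TIP tree's usual reverse fir tree rule for local variable declarations, longest line first): const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; int ret = 0; The hunks below only reorder existing declarations to match this pattern; no behavior changes.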
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20230425183313.888493625@linutronix.de --- kernel/time/posix-timers.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d357728b7f00..e3cddd5f3c7f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -309,10 +309,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { + enum hrtimer_restart ret = HRTIMER_NORESTART; struct k_itimer *timr; unsigned long flags; int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); @@ -400,8 +400,8 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer * alloc_posix_timer(void) { - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { @@ -695,8 +695,8 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct k_itimer *timr; const struct k_clock *kc; + struct k_itimer *timr; unsigned long flags; int ret = 0; @@ -767,8 +767,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; - int overrun; unsigned long flags; + int overrun; timr = lock_timer(timer_id, &flags); if (!timr) @@ -941,8 +941,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { - struct itimerspec64 new_spec, old_spec; - struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; + struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) return -EINVAL; if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; + rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) -- cgit v1.2.3 From b9a40f24d8ea86f89f1299a3f2dccd601febe3e5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 9 Jun 2023 11:46:43 +0200 Subject: posix-timers: Refer properly to CONFIG_HIGH_RES_TIMERS Commit c78f261e5dcb ("posix-timers: Clarify posix_timer_fn() comments") turns an ifdef CONFIG_HIGH_RES_TIMERS into a conditional on "IS_ENABLED(CONFIG_HIGHRES_TIMERS)"; note that the new conditional refers to "HIGHRES_TIMERS" not "HIGH_RES_TIMERS" as before. Fix this typo introduced in that refactoring.
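A brief illustration of why the typo compiled cleanly (a sketch relying on standard Kconfig semantics): IS_ENABLED() evaluates to 0 for any symbol that is not defined, so the misspelled check builds without warning but can never be true: if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { /* real symbol: true when high resolution timers are configured */ } if (IS_ENABLED(CONFIG_HIGHRES_TIMERS)) { /* nonexistent symbol: silently always false */ }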
Fixes: c78f261e5dcb ("posix-timers: Clarify posix_timer_fn() comments") Signed-off-by: Lukas Bulwahn Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230609094643.26253-1-lukas.bulwahn@gmail.com --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e3cddd5f3c7f..b924f0f096fa 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -356,7 +356,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * enabled as the periodic tick based timers are * automatically aligned to the next tick. */ - if (IS_ENABLED(CONFIG_HIGHRES_TIMERS)) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { ktime_t kj = TICK_NSEC; if (timr->it_interval < kj) -- cgit v1.2.3 From 986af8dc5af834071a8a7afdf2c3b7a57d562df2 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Sat, 10 Jun 2023 02:28:56 +0800 Subject: alarmtimer: Remove unnecessary initialization of variable 'ret' ret is assigned before it is checked, so the variable does not need to be initialized Signed-off-by: Li zeming Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230609182856.4660-1-zeming@nfschina.com --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 82b28ab0f328..09c9cde06f20 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -847,7 +847,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, struct restart_block *restart = &current->restart_block; struct alarm alarm; ktime_t exp; - int ret = 0; + int ret; if (!alarmtimer_get_rtcdev()) return -EOPNOTSUPP; -- cgit v1.2.3 From fc4b4d96f793c827a5abf23df3e793592dbd38ce Mon Sep 17 00:00:00 2001 From: Li zeming Date: Sat, 10 Jun 2023 02:20:59 +0800 Subject: alarmtimer: Remove unnecessary (void *) cast Pointers of type void * do not require a type cast when they are assigned to a typed pointer. Signed-off-by: Li zeming Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230609182059.4509-1-zeming@nfschina.com --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 09c9cde06f20..8d9f13d847f0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -751,7 +751,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { - struct task_struct *task = (struct task_struct *)alarm->data; + struct task_struct *task = alarm->data; alarm->data = NULL; if (task) -- cgit v1.2.3 From a7e282c77785c7eabf98836431b1f029481085ad Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 5 May 2023 00:12:53 +0800 Subject: tick/rcu: Fix bogus ratelimit condition The ratelimit logic in report_idle_softirq() is broken because the exit condition is always true: static int ratelimit; if (ratelimit < 10) return false; ---> always returns here ratelimit++; ---> no chance to run Make it check for >= 10 instead.
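For clarity, a sketch of the corrected logic (mirroring the hunk below): static int ratelimit; if (ratelimit >= 10) return false; /* already warned ten times, stay silent */ ratelimit++; /* now reachable: count this warning */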
Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle") Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/tencent_5AAA3EEAB42095C9B7740BE62FBF9A67E007@qq.com --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..89055050d1ac 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1030,7 +1030,7 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit < 10) + if (ratelimit >= 10) return false; /* On RT, softirqs handling may be waiting on some lock */ -- cgit v1.2.3 From 1d28635abcf1914425d6516e641978011984c58a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Jun 2023 15:35:30 -0700 Subject: bpf: Move unprivileged checks into map_create() and bpf_prog_load() Make each bpf() syscall command a bit more self-contained, making it easier to further enhance it. We move sysctl_unprivileged_bpf_disabled handling down to map_create() and bpf_prog_load(), two special commands in this regard. Also swap the order of checks, calling bpf_capable() only if sysctl_unprivileged_bpf_disabled is true, avoiding unnecessary audit messages. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20230613223533.3689589-2-andrii@kernel.org --- kernel/bpf/syscall.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 12955415d376..7c41a623f405 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1157,6 +1157,15 @@ static int map_create(union bpf_attr *attr) !node_online(numa_node))) return -EINVAL; + /* Intent here is for unprivileged_bpf_disabled to block BPF map + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out. + */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map = find_and_alloc_map(attr); if (IS_ERR(map)) @@ -2532,6 +2541,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); + /* Intent here is for unprivileged_bpf_disabled to block BPF program + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out for these + * and other operations. + */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; + if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; @@ -5030,23 +5049,8 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; - bool capable; int err; - capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; - - /* Intent here is for unprivileged_bpf_disabled to block key object - * creation commands for unprivileged users; other actions depend - * of fd availability and access to bpffs, so are dependent on - * object creation success. 
Capabilities are later verified for - * operations such as load and map create, so even with unprivileged - * BPF disabled, capability checks are still carried out for these - * and other operations. - */ - if (!capable && - (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) - return -EPERM; - err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; -- cgit v1.2.3 From 22db41226b679768df8f0a4ff5de8e58f625f45b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Jun 2023 15:35:31 -0700 Subject: bpf: Inline map creation logic in map_create() function Currently find_and_alloc_map() performs two separate functions: some argument sanity checking and partial map creation workflow handling. Neither of those functions is self-sufficient, and both are augmented by further checks and initialization logic in the caller (the map_create() function). So unify all the sanity checks, permission checks, and creation and initialization logic in one linear piece of code in map_create() instead. This also makes it easier to further enhance permission checks and keep them located in one place. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20230613223533.3689589-3-andrii@kernel.org --- kernel/bpf/syscall.c | 57 ++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7c41a623f405..6ef302709ab0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -109,37 +109,6 @@ const struct bpf_map_ops bpf_map_offload_ops = { .map_mem_usage = bpf_map_offload_map_mem_usage, }; -static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) -{ - const struct bpf_map_ops *ops; - u32 type = attr->map_type; - struct bpf_map *map; - int err; - - if (type >= ARRAY_SIZE(bpf_map_types)) - return ERR_PTR(-EINVAL); - type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); - ops = bpf_map_types[type]; - if (!ops) - return ERR_PTR(-EINVAL); - - if (ops->map_alloc_check) { - err = ops->map_alloc_check(attr); - if (err) - return ERR_PTR(err); - } - if (attr->map_ifindex) - ops = &bpf_map_offload_ops; - if (!ops->map_mem_usage) - return ERR_PTR(-EINVAL); - map = ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = ops; - map->map_type = type; - return map; -} - static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); @@ -1127,7 +1096,9 @@ free_map_tab: /* called via syscall */ static int map_create(union bpf_attr *attr) { + const struct bpf_map_ops *ops; int numa_node = bpf_map_attr_numa_node(attr); + u32 map_type = attr->map_type; struct bpf_map *map; int f_flags; int err; @@ -1157,6 +1128,25 @@ static int map_create(union bpf_attr *attr) !node_online(numa_node))) return -EINVAL; + /* find map type and init map: hashtable vs rbtree vs bloom vs ...
*/ + map_type = attr->map_type; + if (map_type >= ARRAY_SIZE(bpf_map_types)) + return -EINVAL; + map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[map_type]; + if (!ops) + return -EINVAL; + + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return err; + } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; + if (!ops->map_mem_usage) + return -EINVAL; + /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on @@ -1166,10 +1156,11 @@ static int map_create(union bpf_attr *attr) if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; - /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ - map = find_and_alloc_map(attr); + map = ops->map_alloc(attr); if (IS_ERR(map)) return PTR_ERR(map); + map->ops = ops; + map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); -- cgit v1.2.3 From 6c3eba1c5e283fd2bb1c076dbfcb47f569c3bfde Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Jun 2023 15:35:32 -0700 Subject: bpf: Centralize permissions checks for all BPF map types This allows for more centralized decisions later on, and generally makes it very explicit which maps are privileged and which are not (e.g., LRU_HASH and LRU_PERCPU_HASH, which are privileged HASH variants, as opposed to unprivileged HASH and HASH_PERCPU; now this is explicit and easy to verify). Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20230613223533.3689589-4-andrii@kernel.org --- kernel/bpf/bloom_filter.c | 3 --- kernel/bpf/bpf_local_storage.c | 3 --- kernel/bpf/bpf_struct_ops.c | 3 --- kernel/bpf/cpumap.c | 4 ---- kernel/bpf/devmap.c | 3 --- kernel/bpf/hashtab.c | 6 ------ kernel/bpf/lpm_trie.c | 3 --- kernel/bpf/queue_stack_maps.c | 4 ---- kernel/bpf/reuseport_array.c | 3 --- kernel/bpf/stackmap.c | 3 --- kernel/bpf/syscall.c | 47 ++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 47 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 540331b610a9..addf3dd57b59 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -86,9 +86,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_bloom_filter *bloom; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->key_size != 0 || attr->value_size == 0 || attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 47d9948d768f..b5149cfce7d4 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -723,9 +723,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!bpf_capable()) - return -EPERM; - if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d3f0a4825fa6..116a0ce378ec 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -655,9 +655,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) const struct btf_type *t, *vt; struct bpf_map *map; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - st_ops =
bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) return ERR_PTR(-ENOTSUPP); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ec18faa74ac..8a33e8747a0e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -89,9 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || (value_size != offsetofend(struct bpf_cpumap_val, qsize) && diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 802692fa3905..49cc0b5671c6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -160,9 +160,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) struct bpf_dtab *dtab; int err; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9901efee4339..56d3da7d0bc6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -422,12 +422,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !bpf_capable()) - /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_BPF. - */ - return -EPERM; - if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. */ return -EPERM; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e0d3ddf2037a..17c7e7782a1f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -544,9 +544,6 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 601609164ef3..8d2ddcb7566b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "percpu_freelist.h" @@ -46,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!bpf_capable()) - return -EPERM; - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index cbf2d8d784b8..4b4f9670f1a9 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b25fce425b2c..458bb80b14d5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -74,9 +74,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->map_flags & 
~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ef302709ab0..658d1154f221 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1156,6 +1156,53 @@ static int map_create(union bpf_attr *attr) if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; + /* check privileged map type permissions */ + switch (map_type) { + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + /* unprivileged */ + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + case BPF_MAP_TYPE_BLOOM_FILTER: + case BPF_MAP_TYPE_LPM_TRIE: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_STRUCT_OPS: + case BPF_MAP_TYPE_CPUMAP: + if (!bpf_capable()) + return -EPERM; + break; + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_XSKMAP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + WARN(1, "unsupported map type %d", map_type); + return -EPERM; + } + map = ops->map_alloc(attr); if (IS_ERR(map)) return PTR_ERR(map); -- cgit v1.2.3 From 7f6719f7a8662a40afed367a685516f9f34e7bc2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Jun 2023 15:35:33 -0700 Subject: bpf: Keep BPF_PROG_LOAD permission checks clear of validations Move flags validation and license checks out of the permission checks. They were intermingled, which makes subsequent changes harder. Clean this up: perform straightforward flag validation upfront, and fetch and check license later, right where we use it. Also consolidate the capabilities check in one block, right after the basic attribute sanity checks.
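An illustrative outline of the resulting bpf_prog_load() order (a simplified sketch, not the complete function): /* 1. straightforward flag and attribute validation upfront */ if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; /* 2. capability checks consolidated in one block */ if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; /* 3. license copied and checked later, right where prog->gpl_compatible is set */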
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20230613223533.3689589-5-andrii@kernel.org --- kernel/bpf/syscall.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 658d1154f221..a75c54b6f8a3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2550,7 +2550,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) struct btf *attach_btf = NULL; int err; char license[128]; - bool is_gpl; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -2569,16 +2568,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) !bpf_capable()) return -EPERM; - /* copy eBPF program license from user space */ - if (strncpy_from_bpfptr(license, - make_bpfptr(attr->license, uattr.is_kernel), - sizeof(license) - 1) < 0) - return -EFAULT; - license[sizeof(license) - 1] = 0; - - /* eBPF programs must be GPL compatible to use GPL-ed functions */ - is_gpl = license_is_gpl_compatible(license); - /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on @@ -2671,12 +2660,20 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; + /* copy eBPF program license from user space */ + if (strncpy_from_bpfptr(license, + make_bpfptr(attr->license, uattr.is_kernel), + sizeof(license) - 1) < 0) + goto free_prog_sec; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); -- cgit v1.2.3 From a92cbb82c8d375d47fbaf0e1ad3fd4074a7cb156 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:53:23 -0700 Subject: perf/core: allow pte_offset_map() to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rare transient cases, not yet made possible, pte_offset_map() and pte_offset_map_lock() may not find a page table: handle appropriately. [hughd@google.com: __wp_page_copy_user(): don't call update_mmu_tlb() with NULL] Link: https://lkml.kernel.org/r/1a4db221-7872-3594-57ce-42369945ec8d@google.com Link: https://lkml.kernel.org/r/a194441b-63f3-adb6-5964-7ca3171ae7c2@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A.
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index db016e418931..174be710f3b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7490,6 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); +again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; @@ -7498,6 +7499,9 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); + if (!ptep) + goto again; + pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = pte_leaf_size(pte); -- cgit v1.2.3 From af2880ec44021d32cc72a5aa7c5d7d7beaa722d3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:56 +0100 Subject: scatterlist: add dedicated config for DMA flags The DMA flags field will be useful for users beyond PCI P2P, so upgrade to its own dedicated config option. [catalin.marinas@arm.com: use #ifdef CONFIG_NEED_SG_DMA_FLAGS in scatterlist.h] [catalin.marinas@arm.com: update PCI_P2PDMA dma_flags comment in scatterlist.h] Link: https://lkml.kernel.org/r/20230612153201.554742-13-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..acc6f231259c 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,9 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config NEED_SG_DMA_FLAGS + bool + config NEED_SG_DMA_LENGTH bool -- cgit v1.2.3 From cb147bbe22d2be9b49021c2e5dacdf2935745d1c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:57 +0100 Subject: dma-mapping: name SG DMA flag helpers consistently sg_is_dma_bus_address() is inconsistent with the naming pattern of its corresponding setters and its own kerneldoc, so take the majority vote and rename it sg_dma_is_bus_address() (and fix up the missing underscores in the kerneldoc too). This gives us a nice clear pattern where SG DMA flags are SG_DMA_<FLAG>, and the helpers for acting on them are sg_dma_<action>_<flag>(). Link: https://lkml.kernel.org/r/20230612153201.554742-14-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Reviewed-by: Jerry Snitselaar Reviewed-by: Logan Gunthorpe Link: https://lore.kernel.org/r/fa2eca2862c7ffc41b50337abffb2dfd2864d3ea.1685036694.git.robin.murphy@arm.com Tested-by: Isaac J.
Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5595d1d5cdcc..d29cade048db 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int i; for_each_sg(sgl, sg, nents, i) { - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_page(dev, sg->dma_address, -- cgit v1.2.3 From 370645f41e6e2fdd2fb6f6982530b04612c9793c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:58 +0100 Subject: dma-mapping: force bouncing if the kmalloc() size is not cache-line-aligned For direct DMA, if the size is small enough to have originated from a kmalloc() cache below ARCH_DMA_MINALIGN, check its alignment against dma_get_cache_alignment() and bounce if necessary. For larger sizes, it is the responsibility of the DMA API caller to ensure proper alignment. At this point, the kmalloc() caches are properly aligned but this will change in a subsequent patch. Architectures can opt in by selecting DMA_BOUNCE_UNALIGNED_KMALLOC. Link: https://lkml.kernel.org/r/20230612153201.554742-15-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Reviewed-by: Robin Murphy Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/dma/Kconfig | 4 ++++ kernel/dma/direct.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index acc6f231259c..abea1823fe21 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -90,6 +90,10 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_BOUNCE_UNALIGNED_KMALLOC + bool + depends on SWIOTLB + config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" depends on OF && OF_RESERVED_MEM && SWIOTLB diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e38ffc5e6bdd..97ec892ea0b5 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -94,7 +94,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) -- cgit v1.2.3 From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. 
This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 607d742caa61..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, -- cgit v1.2.3 From 3efd33b75358185ac9f45cfc189e20712be3570b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 10 Jun 2023 18:28:58 +0800 Subject: kernel: pid_namespace: remove unused set_memfd_noexec_scope() This inline function is unused, remove it. Link: https://lkml.kernel.org/r/20230610102858.31488-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Cc: Jeff Xu Cc: Kees Cook Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/pid_sysctl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index d67a4d45bb42..b26e027fc9cd 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -52,7 +52,6 @@ static inline void register_pid_ns_sysctl_table_vm(void) } #else static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} -static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} static inline void register_pid_ns_sysctl_table_vm(void) {} #endif -- cgit v1.2.3 From 9ec272c586b07d1abf73438524bd12b1df9c5f9b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:31 -0700 Subject: watchdog/hardlockup: keep kernel.nmi_watchdog sysctl as 0444 if probe fails Patch series "watchdog: Cleanup / fixes after buddy series v5 reviews". This patch series attempts to finish resolving the feedback received from Petr Mladek on the v5 series I posted. Probably the only thing that wasn't fully as clean as Petr requested was the Kconfig stuff. I couldn't find a better way to express it without a more major overhaul. In the very least, I renamed "NON_ARCH" to "PERF_OR_BUDDY" in the hopes that will make it marginally better. Nothing in this series is terribly critical and even the bugfixes are small. However, it does cleanup a few things that were pointed out in review. This patch (of 10): The permissions for the kernel.nmi_watchdog sysctl have always been set at compile time despite the fact that a watchdog can fail to probe. Let's fix this and set the permissions based on whether the hardlockup detector actually probed. Link: https://lkml.kernel.org/r/20230527014153.2793931-1-dianders@chromium.org Link: https://lkml.kernel.org/r/20230526184139.1.I0d75971cc52a7283f495aac0bd5c3041aadc734e@changeid Fixes: a994a3147e4c ("watchdog/hardlockup/perf: Implement init time detection of perf") Signed-off-by: Douglas Anderson Reported-by: Petr Mladek Closes: https://lore.kernel.org/r/ZHCn4hNxFpY5-9Ki@alley Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. 
Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 237990e8d345..4b9e31edb47f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -880,15 +880,6 @@ static struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = (void *)&sixty, }, - { - .procname = "nmi_watchdog", - .data = &watchdog_hardlockup_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { .procname = "watchdog_cpumask", .data = &watchdog_cpumask_bits, @@ -952,10 +943,28 @@ static struct ctl_table watchdog_sysctls[] = { {} }; +static struct ctl_table watchdog_hardlockup_sysctl[] = { + { + .procname = "nmi_watchdog", + .data = &watchdog_hardlockup_user_enabled, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); + + if (watchdog_hardlockup_available) + watchdog_hardlockup_sysctl[0].mode = 0644; + register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } + #else #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ @@ -1011,6 +1020,8 @@ static int __init lockup_detector_check(void) /* Make sure no work is pending. */ flush_work(&detector_work); + watchdog_sysctl_init(); + return 0; } @@ -1030,5 +1041,4 @@ void __init lockup_detector_init(void) allow_lockup_detector_init_retry = true; lockup_detector_setup(); - watchdog_sysctl_init(); } -- cgit v1.2.3 From 6426e8d1f27417834ea37e75a9ead832d1cf7713 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:32 -0700 Subject: watchdog/hardlockup: HAVE_NMI_WATCHDOG must implement watchdog_hardlockup_probe() Right now there is one arch (sparc64) that selects HAVE_NMI_WATCHDOG without selecting HAVE_HARDLOCKUP_DETECTOR_ARCH. Because of that one architecture, we have some special case code in the watchdog core to handle the fact that watchdog_hardlockup_probe() isn't implemented. Let's implement watchdog_hardlockup_probe() for sparc64 and get rid of the special case. As a side effect of doing this, code inspection tells us that we could fix a minor bug where the system won't properly realize that NMI watchdogs are disabled. Specifically, on powerpc if CONFIG_PPC_WATCHDOG is turned off the arch might still select CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH which selects CONFIG_HAVE_NMI_WATCHDOG. Since CONFIG_PPC_WATCHDOG was off then nothing will override the "weak" watchdog_hardlockup_probe() and we'll fallback to looking at CONFIG_HAVE_NMI_WATCHDOG. Link: https://lkml.kernel.org/r/20230526184139.2.Ic6ebbf307ca0efe91f08ce2c1eb4a037ba6b0700@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. 
Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b9e31edb47f..62230f5b8878 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -217,19 +217,6 @@ void __weak watchdog_hardlockup_disable(unsigned int cpu) { } */ int __weak __init watchdog_hardlockup_probe(void) { - /* - * If CONFIG_HAVE_NMI_WATCHDOG is defined then an architecture - * is assumed to have the hard watchdog available and we return 0. - */ - if (IS_ENABLED(CONFIG_HAVE_NMI_WATCHDOG)) - return 0; - - /* - * Hardlockup detectors other than those using CONFIG_HAVE_NMI_WATCHDOG - * are required to implement a non-weak version of this probe function - * to tell whether they are available. If they don't override then - * we'll return -ENODEV. - */ return -ENODEV; } -- cgit v1.2.3 From 2711e4adef4fac2eeaee66e3c22a2f75ee86e7b3 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:33 -0700 Subject: watchdog/hardlockup: don't use raw_cpu_ptr() in watchdog_hardlockup_kick() In the patch ("watchdog/hardlockup: add a "cpu" param to watchdog_hardlockup_check()") there was no reason to use raw_cpu_ptr(). Using this_cpu_ptr() works fine. Link: https://lkml.kernel.org/r/20230526184139.3.I660e103077dcc23bb29aaf2be09cb234e0495b2d@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 62230f5b8878..32dac8028753 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -133,7 +133,7 @@ static bool is_hardlockup(unsigned int cpu) static unsigned long watchdog_hardlockup_kick(void) { - return atomic_inc_return(raw_cpu_ptr(&hrtimer_interrupts)); + return atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) -- cgit v1.2.3 From 7a71d8e650b06833095e7a0d4206585e8585c00f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:34 -0700 Subject: watchdog/hardlockup: in watchdog_hardlockup_check() use cpumask_copy() In the patch ("watchdog/hardlockup: add a "cpu" param to watchdog_hardlockup_check()") we started using a cpumask to keep track of which CPUs to backtrace. When setting up this cpumask, it's better to use cpumask_copy() than to just copy the structure directly. Fix this. Link: https://lkml.kernel.org/r/20230526184139.4.Iccee2d1ea19114dafb6553a854ea4d8ab2a3f25b@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 32dac8028753..85f4839b6faf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,9 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); - struct cpumask backtrace_mask = *cpu_online_mask; + struct cpumask backtrace_mask; + + cpumask_copy(&backtrace_mask, cpu_online_mask); /* Only print hardlockups once. 
*/ if (per_cpu(watchdog_hardlockup_warned, cpu)) -- cgit v1.2.3 From d3b62ace0f097f1d863fb6c41df3c61503e4ec9e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:36 -0700 Subject: watchdog/buddy: cleanup how watchdog_buddy_check_hardlockup() is called In the patch ("watchdog/hardlockup: detect hard lockups using secondary (buddy) CPUs"), we added a call from the common watchdog.c file into the buddy. That call could be done more cleanly. Specifically: 1. If we move the call into watchdog_hardlockup_kick() then it keeps watchdog_timer_fn() simpler. 2. We don't need to pass an "unsigned long" to the buddy for the timer count. In the patch ("watchdog/hardlockup: add a "cpu" param to watchdog_hardlockup_check()") the count was changed to "atomic_t" which is backed by an int, so we should match types. Link: https://lkml.kernel.org/r/20230526184139.6.I006c7d958a1ea5c4e1e4dc44a25596d9bb5fd3ba@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 15 +++++++-------- kernel/watchdog_buddy.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 85f4839b6faf..6cc46b8e3d07 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -131,9 +131,12 @@ static bool is_hardlockup(unsigned int cpu) return false; } -static unsigned long watchdog_hardlockup_kick(void) +static void watchdog_hardlockup_kick(void) { - return atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); + int new_interrupts; + + new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); + watchdog_buddy_check_hardlockup(new_interrupts); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) @@ -195,7 +198,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ -static inline unsigned long watchdog_hardlockup_kick(void) { return 0; } +static inline void watchdog_hardlockup_kick(void) { } #endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ @@ -449,15 +452,11 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - unsigned long hrtimer_interrupts; if (!watchdog_enabled) return HRTIMER_NORESTART; - hrtimer_interrupts = watchdog_hardlockup_kick(); - - /* test for hardlockups */ - watchdog_buddy_check_hardlockup(hrtimer_interrupts); + watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index fee45af2e5bd..3ffc5f2ede5a 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -72,7 +72,7 @@ void watchdog_hardlockup_disable(unsigned int cpu) cpumask_clear_cpu(cpu, &watchdog_cpus); } -void watchdog_buddy_check_hardlockup(unsigned long hrtimer_interrupts) +void watchdog_buddy_check_hardlockup(int hrtimer_interrupts) { unsigned int next_cpu; -- cgit v1.2.3 From 813efda23934edcad96343fc96727017378c3fe9 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:37 -0700 Subject: watchdog/buddy: don't copy the cpumask in watchdog_next_cpu() There's no reason to make a copy of the "watchdog_cpus" locally in watchdog_next_cpu(). 
Making a copy wouldn't make things any more race-free, and we're just reading the value, so there's no need for a copy. Link: https://lkml.kernel.org/r/20230526184139.7.If466f9a2b50884cbf6a1d8ad05525a2c17069407@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog_buddy.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index 3ffc5f2ede5a..2ef88722c5e7 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -10,12 +10,11 @@ static cpumask_t __read_mostly watchdog_cpus; static unsigned int watchdog_next_cpu(unsigned int cpu) { - cpumask_t cpus = watchdog_cpus; unsigned int next_cpu; - next_cpu = cpumask_next(cpu, &cpus); + next_cpu = cpumask_next(cpu, &watchdog_cpus); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(&cpus); + next_cpu = cpumask_first(&watchdog_cpus); if (next_cpu == cpu) return nr_cpu_ids; -- cgit v1.2.3 From 28168eca3297d68faa8a9433ec93cb6acf06d2f4 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 May 2023 18:41:39 -0700 Subject: watchdog/hardlockup: move SMP barriers from common code to buddy code It's been suggested that since the SMP barriers are only potentially useful for the buddy hardlockup detector, not the perf hardlockup detector, the barriers belong in the buddy code. Let's move them and add clearer comments about why they're needed. Link: https://lkml.kernel.org/r/20230526184139.9.I5ab0a0eeb0bd52fb23f901d298c72fa5c396e22b@changeid Signed-off-by: Douglas Anderson Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 ------ kernel/watchdog_buddy.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6cc46b8e3d07..a351ab0c35eb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -109,9 +109,6 @@ EXPORT_SYMBOL(arch_touch_nmi_watchdog); void watchdog_hardlockup_touch_cpu(unsigned int cpu) { per_cpu(watchdog_hardlockup_touched, cpu) = true; - - /* Match with smp_rmb() in watchdog_hardlockup_check() */ - smp_wmb(); } static bool is_hardlockup(unsigned int cpu) @@ -141,9 +138,6 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { - /* Match with smp_wmb() in watchdog_hardlockup_touch_cpu() */ - smp_rmb(); - if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index 2ef88722c5e7..34dbfe091f4b 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -51,6 +51,13 @@ void watchdog_hardlockup_enable(unsigned int cpu) if (next_cpu < nr_cpu_ids) watchdog_hardlockup_touch_cpu(next_cpu); + /* + * Makes sure that watchdog is touched on this CPU before + * other CPUs could see it in watchdog_cpus. The counter + * part is in watchdog_buddy_check_hardlockup().
+ */ + smp_wmb(); + cpumask_set_cpu(cpu, &watchdog_cpus); } @@ -68,6 +75,13 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (next_cpu < nr_cpu_ids) watchdog_hardlockup_touch_cpu(next_cpu); + /* + * Makes sure that watchdog is touched on the next CPU before + * this CPU disappear in watchdog_cpus. The counter part is in + * watchdog_buddy_check_hardlockup(). + */ + smp_wmb(); + cpumask_clear_cpu(cpu, &watchdog_cpus); } @@ -88,5 +102,12 @@ void watchdog_buddy_check_hardlockup(int hrtimer_interrupts) if (next_cpu >= nr_cpu_ids) return; + /* + * Make sure that the watchdog was touched on next CPU when + * watchdog_next_cpu() returned another one because of + * a change in watchdog_hardlockup_enable()/disable(). + */ + smp_rmb(); + watchdog_hardlockup_check(next_cpu, NULL); } -- cgit v1.2.3 From a5fcc2367e223c45c78a882438c2b8e13fe0f580 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 16 Jun 2023 17:06:16 +0200 Subject: watchdog/hardlockup: make HAVE_NMI_WATCHDOG sparc64-specific There are several hardlockup detector implementations and several Kconfig values which allow selection and build of the preferred one. CONFIG_HARDLOCKUP_DETECTOR was introduced by the commit 23637d477c1f53acb ("lockup_detector: Introduce CONFIG_HARDLOCKUP_DETECTOR") in v2.6.36. It was a preparation step for introducing the new generic perf hardlockup detector. The existing arch-specific variants did not support the to-be-created generic build configurations, sysctl interface, etc. This distinction was made explicit by the commit 4a7863cc2eb5f98 ("x86, nmi_watchdog: Remove ARCH_HAS_NMI_WATCHDOG and rely on CONFIG_HARDLOCKUP_DETECTOR") in v2.6.38. CONFIG_HAVE_NMI_WATCHDOG was introduced by the commit d314d74c695f967e105 ("nmi watchdog: do not use cpp symbol in Kconfig") in v3.4-rc1. It replaced the above mentioned ARCH_HAS_NMI_WATCHDOG. At that time, it was still used by three architectures, namely blackfin, mn10300, and sparc. The support for blackfin and mn10300 architectures has been completely dropped some time ago. And sparc is the only architecture with the historic NMI watchdog at the moment. And the old sparc implementation is really special. It is always built on sparc64. It used to be always enabled until the commit 7a5c8b57cec93196b ("sparc: implement watchdog_nmi_enable and watchdog_nmi_disable") added in v4.10-rc1. There are only few locations where the sparc64 NMI watchdog interacts with the generic hardlockup detectors code: + implements arch_touch_nmi_watchdog() which is called from the generic touch_nmi_watchdog() + implements watchdog_hardlockup_enable()/disable() to support /proc/sys/kernel/nmi_watchdog + is always preferred over other generic watchdogs, see CONFIG_HARDLOCKUP_DETECTOR + includes asm/nmi.h into linux/nmi.h because some sparc-specific functions are needed in sparc-specific code which includes only linux/nmi.h. The situation became more complicated after the commit 05a4a95279311c3 ("kernel/watchdog: split up config options") and commit 2104180a53698df5 ("powerpc/64s: implement arch-specific hardlockup watchdog") in v4.13-rc1. They introduced HAVE_HARDLOCKUP_DETECTOR_ARCH. It was used for powerpc specific hardlockup detector. It was compatible with the perf one regarding the general boot, sysctl, and programming interfaces. HAVE_HARDLOCKUP_DETECTOR_ARCH was defined as a superset of HAVE_NMI_WATCHDOG. 
It made some sense because all arch-specific detectors had some common requirements, namely: + implemented arch_touch_nmi_watchdog() + included asm/nmi.h into linux/nmi.h + defined the default value for /proc/sys/kernel/nmi_watchdog But it actually has made things pretty complicated when the generic buddy hardlockup detector was added. Before, the generic perf detector was never supported together with an arch-specific one. But the buddy detector could work on any SMP system. It means that an architecture could support both the arch-specific and buddy detector. As a result, there are a few tricky dependencies. For example, CONFIG_HARDLOCKUP_DETECTOR depends on: ((HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_BUDDY) && !HAVE_NMI_WATCHDOG) || HAVE_HARDLOCKUP_DETECTOR_ARCH The problem is that the very special sparc implementation is defined as: HAVE_NMI_WATCHDOG && !HAVE_HARDLOCKUP_DETECTOR_ARCH Another problem is that the meaning of HAVE_NMI_WATCHDOG is far from clear without understanding the history. Make the logic less tricky and more self-explanatory by making HAVE_NMI_WATCHDOG specific to the sparc64 implementation. And rename it to HAVE_HARDLOCKUP_DETECTOR_SPARC64. Note that HARDLOCKUP_DETECTOR_PREFER_BUDDY, HARDLOCKUP_DETECTOR_PERF, and HARDLOCKUP_DETECTOR_BUDDY may conflict only with HAVE_HARDLOCKUP_DETECTOR_ARCH. They depend on HARDLOCKUP_DETECTOR and it is no longer enabled when HAVE_NMI_WATCHDOG is set. Link: https://lkml.kernel.org/r/20230616150618.6073-5-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Douglas Anderson Cc: Christophe Leroy Cc: "David S. Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a351ab0c35eb..010fcc3ac141 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,7 +29,7 @@ static DEFINE_MUTEX(watchdog_mutex); -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_SPARC64) # define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else # define WATCHDOG_HARDLOCKUP_DEFAULT 0 -- cgit v1.2.3 From 47f4cb433923a08d81f1e5c065cb680215109db9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 16 Jun 2023 17:06:17 +0200 Subject: watchdog/sparc64: define HARDLOCKUP_DETECTOR_SPARC64 The HAVE_ prefix means that the code could be enabled. Add another variable for HAVE_HARDLOCKUP_DETECTOR_SPARC64 without this prefix. It will be set when it should be built. It will make it compatible with the other hardlockup detectors. Before, it is far from obvious that the SPARC64 variant is actually used: $> make ARCH=sparc64 defconfig $> grep HARDLOCKUP_DETECTOR .config CONFIG_HAVE_HARDLOCKUP_DETECTOR_BUDDY=y CONFIG_HAVE_HARDLOCKUP_DETECTOR_SPARC64=y After, it is clearer: $> make ARCH=sparc64 defconfig $> grep HARDLOCKUP_DETECTOR .config CONFIG_HAVE_HARDLOCKUP_DETECTOR_BUDDY=y CONFIG_HAVE_HARDLOCKUP_DETECTOR_SPARC64=y CONFIG_HARDLOCKUP_DETECTOR_SPARC64=y Link: https://lkml.kernel.org/r/20230616150618.6073-6-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Douglas Anderson Cc: Christophe Leroy Cc: "David S.
Miller" Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 010fcc3ac141..be38276a365f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,7 +29,7 @@ static DEFINE_MUTEX(watchdog_mutex); -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_SPARC64) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64) # define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else # define WATCHDOG_HARDLOCKUP_DEFAULT 0 -- cgit v1.2.3 From f3d40e6545594c22733d091c5ec6b8ff345cbd57 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 20 Jun 2023 18:34:55 -0400 Subject: fgraph: Add declaration of "struct fgraph_ret_regs" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In final testing of: https://patchwork.kernel.org/project/linux-trace-kernel/patch/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/ "function_graph: Support recording and printing the return value of function" The test failed due to a new warning found in the build: kernel/trace/fgraph.c:243:56: warning: ‘struct fgraph_ret_regs’ declared inside parameter list will not be visible outside of this definition or declaration Instead of asking to send another patch series, just add it and then apply the updates. Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 218cd95bf8e4..ea3d7bb235d3 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -236,6 +236,9 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + /* * Send the trace to the ring-buffer. * @return the original return address. -- cgit v1.2.3 From a1be9ccc57f07d54278be34eed6bd679bc941c97 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Sat, 8 Apr 2023 05:42:15 -0700 Subject: function_graph: Support recording and printing the return value of function Analyzing system call failures with the function_graph tracer can be a time-consuming process, particularly when locating the kernel function that first returns an error in the trace logs. This change aims to simplify the process by recording the function return value to the 'retval' member of 'ftrace_graph_ret' and printing it when outputting the trace log. We have introduced new trace options: funcgraph-retval and funcgraph-retval-hex. The former controls whether to display the return value, while the latter controls the display format. Please note that even if a function's return type is void, a return value will still be printed. You can simply ignore it. This patch only establishes the fundamental infrastructure. Subsequent patches will make this feature available on some commonly used processor architectures. Here is an example: I attempted to attach the demo process to a cpu cgroup, but it failed: echo `pidof demo` > /sys/fs/cgroup/cpu/test/tasks -bash: echo: write error: Invalid argument The strace logs indicate that the write system call returned -EINVAL(-22): ... write(1, "273\n", 4) = -1 EINVAL (Invalid argument) ... 
To capture trace logs during a write system call, use the following commands: cd /sys/kernel/debug/tracing/ echo 0 > tracing_on echo > trace echo *sys_write > set_graph_function echo *spin* > set_graph_notrace echo *rcu* >> set_graph_notrace echo *alloc* >> set_graph_notrace echo preempt* >> set_graph_notrace echo kfree* >> set_graph_notrace echo $$ > set_ftrace_pid echo function_graph > current_tracer echo 1 > options/funcgraph-retval echo 0 > options/funcgraph-retval-hex echo 1 > tracing_on echo `pidof demo` > /sys/fs/cgroup/cpu/test/tasks echo 0 > tracing_on cat trace > ~/trace.log To locate the root cause, search for error code -22 directly in the file trace.log and identify the first function that returned -22. Once you have identified this function, examine its code to determine the root cause. For example, in the trace log below, cpu_cgroup_can_attach returned -22 first, so we can focus our analysis on this function to identify the root cause. ... 1) | cgroup_migrate() { 1) 0.651 us | cgroup_migrate_add_task(); /* = 0xffff93fcfd346c00 */ 1) | cgroup_migrate_execute() { 1) | cpu_cgroup_can_attach() { 1) | cgroup_taskset_first() { 1) 0.732 us | cgroup_taskset_next(); /* = 0xffff93fc8fb20000 */ 1) 1.232 us | } /* cgroup_taskset_first = 0xffff93fc8fb20000 */ 1) 0.380 us | sched_rt_can_attach(); /* = 0x0 */ 1) 2.335 us | } /* cpu_cgroup_can_attach = -22 */ 1) 4.369 us | } /* cgroup_migrate_execute = -22 */ 1) 7.143 us | } /* cgroup_migrate = -22 */ ... Link: https://lkml.kernel.org/r/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn Tested-by: Florian Kauer Acked-by: Masami Hiramatsu (Google) Signed-off-by: Donglin Peng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 15 ++++++ kernel/trace/fgraph.c | 23 ++++++++- kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 26 ++++++++++ kernel/trace/trace_functions_graph.c | 93 ++++++++++++++++++++++++++++++++---- 5 files changed, 148 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..abe5c583bd59 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst +config HAVE_FUNCTION_GRAPH_RETVAL + bool + config HAVE_DYNAMIC_FTRACE bool help @@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ea3d7bb235d3..cd2c35b1dd8f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -243,12 +243,16 @@ struct fgraph_ret_regs; * Send the trace to the ring-buffer. * @return the original return address. 
*/ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); /* @@ -269,6 +273,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). + */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack * @task: The task to read the shadow stack from diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..e6407a27d644 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -832,6 +832,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index cd41e863b51c..340b2fa98218 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __entry->depth) ); +#endif + /* * Context switch trace entry - which task (and prio) we switched from/to: * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 203204cadf92..c35fbaab2a47 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? 
*/ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? */ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, trace_seq_puts(s, "| "); } +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps();\n", (void *)call->func); + /* + * Write out the function return value if the option function-retval is + * enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, trace_seq_putc(s, ' '); /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. Always do - * that if the funcgraph-tail option is enabled. + * Always write out the function name and its return value if the + * function-retval option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); - else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. 
+ */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) -- cgit v1.2.3 From a04de42460e271e532b43ad40e9fb07a9a09fe5c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 17 Jun 2023 15:48:09 +0800 Subject: cgroup: remove obsolete comment on cgroup_on_dfl() The debug feature is supported since commit 8cc38fa7fa31 ("cgroup: make debug an implicit controller on cgroup2"), update corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 04d1c0cde882..065bebb4af9b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -313,8 +313,6 @@ bool cgroup_ssid_enabled(int ssid) * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { -- cgit v1.2.3 From 36de5f303ca1bd6fce74815ef17ef3d8ff8737b5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 14 Jun 2023 19:18:22 -0600 Subject: cgroup: Avoid -Wstringop-overflow warnings Address the following -Wstringop-overflow warnings seen when built with ARM architecture and aspeed_g4_defconfig configuration (notice that under this configuration CGROUP_SUBSYS_COUNT == 0): kernel/cgroup/cgroup.c:1208:16: warning: 'find_existing_css_set' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:1258:15: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6089:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6153:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] These changes are based on commit d20d30ebb199 ("cgroup: Avoid compiler warnings with no subsystems"). Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 065bebb4af9b..1404e8e8abf7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1198,6 +1198,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, unsigned long key; int ssid; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches @@ -6017,6 +6020,9 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return -EINVAL; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); -- cgit v1.2.3 From e973dfe9299441ad263efedf094897fbb31e6a87 Mon Sep 17 00:00:00 2001 From: LeiZhou-97 Date: Tue, 13 Jun 2023 18:59:29 +0800 Subject: cgroup/misc: Expose misc.current on cgroup v2 root This patch exposes misc.current on the cgroup v2 root for tracking how much of the resource has been consumed in total on the system. Most cloud infrastructure uses cgroups to fetch host information for scheduling purposes. Currently, the misc controller can be used by Intel TDX HKIDs and AMD SEV ASIDs, which are both used for creating encrypted VMs.
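With this change applied, the system-wide total becomes a single read from the v2 root. An illustrative session (assuming cgroup2 is mounted at /sys/fs/cgroup; the resource name and the numbers are made up):

  # cat /sys/fs/cgroup/misc.capacity
  sev_asid 509
  # cat /sys/fs/cgroup/misc.current
  sev_asid 11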
Intel TDX and AMD SEV are mostly used by cloud providers for providing confidential VMs. In actual use of a server, these confidential VMs may be launched in different ways. For the cloud solution, there are kubevirt and coco (tracked by kubepods.slice); on the host, they can be booted directly through qemu by the end user (tracked by user.slice), etc. In this complex environment, knowing how much of a resource is used in total requires iterating through all existing slices, reading each misc.current, and adding the values up to calculate the total number of consumed keys. Exposing misc.current in the root cgroup makes it much easier to calculate total resource consumption, which helps to schedule and account resources for the cloud infrastructure. Signed-off-by: LeiZhou-97 Signed-off-by: Tejun Heo --- kernel/cgroup/misc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index fe3e8a0eb7ed..ae2f4dd47508 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -357,7 +357,6 @@ static struct cftype misc_cg_files[] = { { .name = "current", .seq_show = misc_cg_current_show, - .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "capacity", -- cgit v1.2.3 From ccaa4926c2264ca2a2fcad4b3511fe435d7d4d15 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 21 Jun 2023 08:59:28 +0100 Subject: hrtimer: Add missing sparse annotations to hrtimer locking Sparse warns about lock imbalance vs. the hrtimer_base lock due to missing sparse annotations: kernel/time/hrtimer.c:175:33: warning: context imbalance in 'lock_hrtimer_base' - wrong count at exit kernel/time/hrtimer.c:1301:28: warning: context imbalance in 'hrtimer_start_range_ns' - unexpected unlock kernel/time/hrtimer.c:1336:28: warning: context imbalance in 'hrtimer_try_to_cancel' - unexpected unlock kernel/time/hrtimer.c:1457:9: warning: context imbalance in '__hrtimer_get_remaining' - unexpected unlock Add the annotations to the relevant functions.
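For readers new to these annotations, this is the general pattern sparse checks; a standalone sketch with a generic lock (assuming <linux/spinlock.h>), not code from this patch:

-------------------------------- %< -----------------------------------
/* sparse tracks the lock context through these attributes; they
 * compile away in non-__CHECKER__ builds. */
static void my_lock(spinlock_t *lock)
	__acquires(lock)
{
	spin_lock(lock);
}

static void my_unlock(spinlock_t *lock)
	__releases(lock)
{
	spin_unlock(lock);
}
-------------------------------- >% -----------------------------------

A function that returns with the lock held gets __acquires(); the counterpart that is entered with the lock held and drops it gets __releases(), which is exactly how the hunks below annotate lock_hrtimer_base() and unlock_hrtimer_base().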
Signed-off-by: Ben Dooks Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230621075928.394481-1-ben.dooks@codethink.co.uk --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e8c08292defc..238262e4aba7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,6 +164,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base) static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; @@ -280,6 +281,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base) static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; @@ -1013,6 +1015,7 @@ void hrtimers_resume_local(void) */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } -- cgit v1.2.3 From ddb5cdbafaaad6b99d7007ae1740403124502d03 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 12 Jun 2023 00:50:52 +0900 Subject: kbuild: generate KSYMTAB entries by modpost Commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS") made modpost output CRCs in the same way whether the EXPORT_SYMBOL() is placed in *.c or *.S. For further cleanups, this commit applies a similar approach to the entire data structure of EXPORT_SYMBOL(). The EXPORT_SYMBOL() compilation is split into two stages. When a source file is compiled, EXPORT_SYMBOL() will be converted into a dummy symbol in the .export_symbol section. For example, EXPORT_SYMBOL(foo); EXPORT_SYMBOL_NS_GPL(bar, BAR_NAMESPACE); will be encoded into the following assembly code: .section ".export_symbol","a" __export_symbol_foo: .asciz "" /* license */ .asciz "" /* name space */ .balign 8 .quad foo /* symbol reference */ .previous .section ".export_symbol","a" __export_symbol_bar: .asciz "GPL" /* license */ .asciz "BAR_NAMESPACE" /* name space */ .balign 8 .quad bar /* symbol reference */ .previous They are mere markers to tell modpost the name, license, and namespace of the symbols. They will be dropped from the final vmlinux and modules because the *(.export_symbol) will go into /DISCARD/ in the linker script. Then, modpost extracts all the information about EXPORT_SYMBOL() from the .export_symbol section, and generates the final C code: KSYMTAB_FUNC(foo, "", ""); KSYMTAB_FUNC(bar, "_gpl", "BAR_NAMESPACE"); KSYMTAB_FUNC() (or KSYMTAB_DATA() if it is data) is expanded to struct kernel_symbol that will be linked to the vmlinux or a module. With this change, EXPORT_SYMBOL() works in the same way for *.c and *.S files, providing the following benefits. [1] Deprecate EXPORT_DATA_SYMBOL() In the old days, EXPORT_SYMBOL() was only available in C files. To export a symbol in *.S, EXPORT_SYMBOL() was placed in a separate *.c file. arch/arm/kernel/armksyms.c is one example written in the classic manner. Commit 22823ab419d8 ("EXPORT_SYMBOL() for asm") removed this limitation. Since then, EXPORT_SYMBOL() can be placed close to the symbol definition in *.S files. It was a nice improvement. However, as that commit mentioned, you need to use EXPORT_DATA_SYMBOL() for data objects on some architectures. 
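To make [1] concrete, here is a hedged sketch of an assembly-side export under the new scheme (file and symbol names invented; the emitted KSYMTAB_*() lines follow the encoding shown above):

-------------------------------- %< -----------------------------------
/* foo.S -- illustrative only */
#include <linux/export.h>
#include <linux/linkage.h>

SYM_FUNC_START(my_memcpy)
	ret
SYM_FUNC_END(my_memcpy)
EXPORT_SYMBOL(my_memcpy)	/* modpost emits KSYMTAB_FUNC() */

SYM_DATA_START(my_table)
	.quad	0
SYM_DATA_END(my_table)
EXPORT_SYMBOL(my_table)		/* modpost emits KSYMTAB_DATA() */
-------------------------------- >% -----------------------------------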
In the new approach, modpost checks the symbol's type (STT_FUNC or not) and outputs KSYMTAB_FUNC() or KSYMTAB_DATA() accordingly. There are only two users of EXPORT_DATA_SYMBOL: EXPORT_DATA_SYMBOL_GPL(empty_zero_page) (arch/ia64/kernel/head.S) EXPORT_DATA_SYMBOL(ia64_ivt) (arch/ia64/kernel/ivt.S) They are transformed as follows and output into .vmlinux.export.c: KSYMTAB_DATA(empty_zero_page, "_gpl", ""); KSYMTAB_DATA(ia64_ivt, "", ""); The other EXPORT_SYMBOL users in ia64 assembly are output as KSYMTAB_FUNC(). EXPORT_DATA_SYMBOL() is now deprecated. [2] merge <linux/export.h> and <asm-generic/export.h> There are two similar header implementations: include/linux/export.h for .c files include/asm-generic/export.h for .S files Ideally, the functionality should be consistent between them, but they tend to diverge. Commit 8651ec01daed ("module: add support for symbol namespaces.") did not support the namespace for *.S files. This commit shifts the essential implementation part to C, which supports EXPORT_SYMBOL_NS() for *.S files. <asm/export.h> and <asm-generic/export.h> will remain as wrappers of <linux/export.h> for a while. They will be removed after #include <asm/export.h> directives are all replaced with #include <linux/export.h>. [3] Implement CONFIG_TRIM_UNUSED_KSYMS in one-pass algorithm (by a later commit) When CONFIG_TRIM_UNUSED_KSYMS is enabled, Kbuild recursively traverses the directory tree to determine which EXPORT_SYMBOL to trim. If an EXPORT_SYMBOL turns out to be unused by anyone, Kbuild begins the second traverse, where some source files are recompiled with their EXPORT_SYMBOL() turned into a no-op. We can do this better now; modpost can selectively emit KSYMTAB entries that are really used by modules. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- kernel/module/internal.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index dc7b0160c480..c8b7b4dcf782 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -32,6 +32,18 @@ /* Maximum number of characters written by module_flags() */ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) +struct kernel_symbol { +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + int value_offset; + int name_offset; + int namespace_offset; +#else + unsigned long value; + const char *name; + const char *namespace; +#endif +}; + extern struct mutex module_mutex; extern struct list_head modules; -- cgit v1.2.3 From 83f74441bcb16c324b7bdba0ab4261a44cb1ac21 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 11 Jun 2023 15:00:29 +0200 Subject: ftrace: Show all functions with addresses in available_filter_functions_addrs Adding a new available_filter_functions_addrs file that shows all available functions (same as available_filter_functions) together with their addresses, like: # cat available_filter_functions_addrs | head ffffffff81000770 __traceiter_initcall_level ffffffff810007c0 __traceiter_initcall_start ffffffff81000810 __traceiter_initcall_finish ffffffff81000860 trace_initcall_finish_cb ... Note that the displayed address is the patch-site address and can differ from the /proc/kallsyms address. It's useful to have the address available for traceable symbols, so we don't need to always cross-check kallsyms with available_filter_functions (or the other way around) and we have all the data in a single file. For backwards compatibility reasons we can't change the existing available_filter_functions file output, but we need to add a new file.
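As a quick preview of the consumer side (the two-pass motivation continues below), a minimal user-space sketch that parses the new file into (address, symbol) pairs; the path assumes tracefs is mounted at /sys/kernel/tracing:

-------------------------------- %< -----------------------------------
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
	unsigned long addr;
	char sym[256];

	if (!f)
		return 1;

	/* each line is "<hex patch-site address> <symbol>" */
	while (fscanf(f, "%lx %255s", &addr, sym) == 2)
		printf("%s at 0x%lx\n", sym, addr);

	fclose(f);
	return 0;
}
-------------------------------- >% -----------------------------------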
The problem is that we need to do 2 passes: - through available_filter_functions and find out if the function is traceable - through /proc/kallsyms to get the address for traceable function Having available_filter_functions symbols together with addresses allow us to skip the kallsyms step and we are ok with the address in available_filter_functions_addr not being the function entry, because kprobe_multi uses fprobe and that handles both entry and patch-site address properly. We have 2 interfaces how to create kprobe_multi link: a) passing symbols to kernel 1) user gathers symbols and need to ensure that they are trace-able -> pass through available_filter_functions file 2) kernel takes those symbols and translates them to addresses through kallsyms api 3) addresses are passed to fprobe/ftrace through: register_fprobe_ips -> ftrace_set_filter_ips b) passing addresses to kernel 1) user gathers symbols and needs to ensure that they are trace-able -> pass through available_filter_functions file 2) user takes those symbols and translates them to addresses through /proc/kallsyms 3) addresses are passed to the kernel and kernel calls: register_fprobe_ips -> ftrace_set_filter_ips The new available_filter_functions_addrs file helps us with option b), because we can make 'b 1' and 'b 2' in one step - while filtering traceable functions, we get the address directly. Link: https://lore.kernel.org/linux-trace-kernel/20230611130029.1202298-1-jolsa@kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrii Nakryiko Tested-by: Jackie Liu # x86 Suggested-by: Steven Rostedt (Google) Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..b24c573934af 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3861,6 +3861,9 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + if (print_rec(m, rec->ip)) { /* This should only happen when a rec is disabled */ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); @@ -3996,6 +3999,30 @@ ftrace_touched_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5916,6 +5943,13 @@ static const struct file_operations ftrace_touched_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6377,6 +6411,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); + 
trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); -- cgit v1.2.3 From 4998e7fda149d2392ea6aa9879299d8a32019dbe Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:25 +0200 Subject: tracing/osnoise: Switch from PF_NO_SETAFFINITY to migrate_disable Currently, osnoise/timerlat threads run with PF_NO_SETAFFINITY set. It works well, however, cgroups do not allow PF_NO_SETAFFINITY threads to be accepted, and this creates a limitation to osnoise/timerlat. To avoid this limitation, disable migration of the threads as soon as they start to run, and then clean the PF_NO_SETAFFINITY flag (still) used during thread creation. If for some reason a thread migration is requested, e.g., via sched_settafinity, the tracer thread will notice and exit. Link: https://lkml.kernel.org/r/8ba8bc9c15b3ea40cf73cf67a9bc061a264609f0.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e97e3fa5cbed..c265ec5f1726 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1545,6 +1545,39 @@ static void osnoise_sleep(void) } } +/* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1553,12 +1586,29 @@ static void osnoise_sleep(void) */ static int osnoise_main(void *data) { + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. 
+ */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + run_osnoise(); osnoise_sleep(); } + migrate_enable(); return 0; } @@ -1706,6 +1756,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1714,6 +1765,18 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; @@ -1731,6 +1794,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1749,10 +1813,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ -- cgit v1.2.3 From cb7ca871c883eed5132e106cda44b2b060e6f52e Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:26 +0200 Subject: tracing/osnoise: Skip running osnoise if all instances are off In the case of all tracing instances being off, sleep for the entire period. Q: Why not kill all threads so? A: It is valid and useful to start the threads with tracing off. For example, rtla disables tracing, starts the tracer, applies the scheduling setup to the threads, e.g., sched priority and cgroup, and then begin tracing with all set. Skipping the period helps to speed up rtla setup and save the trace after a stop tracing. Link: https://lkml.kernel.org/r/aa4dd9b7e76fcb63901fe5407e15ec002b318599.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index c265ec5f1726..220172cb874d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1285,6 +1285,22 @@ static __always_inline void osnoise_stop_tracing(void) rcu_read_unlock(); } +/* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + /* * notify_new_max_latency - Notify a new max latency via fsnotify interface. 
*/ @@ -1517,13 +1533,16 @@ static struct cpumask save_cpumask; /* * osnoise_sleep - sleep until the next period */ -static void osnoise_sleep(void) +static void osnoise_sleep(bool skip_period) { u64 interval; ktime_t wake_time; mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; mutex_unlock(&interface_lock); /* @@ -1604,8 +1623,14 @@ static int osnoise_main(void *data) if (osnoise_migration_pending()) break; + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + run_osnoise(); - osnoise_sleep(); + osnoise_sleep(false); } migrate_enable(); -- cgit v1.2.3 From e88ed227f639ebcb31ed4e5b88756b47d904584b Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 6 Jun 2023 17:12:27 +0200 Subject: tracing/timerlat: Add user-space interface Going a step further, we propose a way to use any user-space workload as the task waiting for the timerlat timer. This is done via a per-CPU file named osnoise/per_cpu/cpu$id/timerlat_fd. The timerlat_fd allows a single task to have it open at a time. When a task reads the file, the timerlat timer is armed to fire osnoise/timerlat_period_us into the future. When the timer fires, it prints the IRQ latency and wakes up the user-space thread waiting in the timerlat_fd. The thread then starts to run, executes the timerlat measurement, prints the thread scheduling latency and returns to user-space. When the thread rereads the timerlat_fd, the tracer will print the user-ret(urn) latency, which is an additional metric. This additional metric is also traced by the tracer and can be used, for example, to measure the context switch overhead from kernel-to-user and user-to-kernel, or the response time for an arbitrary execution in user-space. The tracer supports one thread per CPU, the thread must be pinned to the CPU, and it cannot migrate while holding the timerlat_fd. The reason is that the tracer is per CPU (nothing prohibits the tracer from allowing migrations in the future). The tracer monitors the thread for migration and disables itself if one is detected. The timerlat_fd is only available for opening/reading when the timerlat tracer is enabled, and NO_OSNOISE_WORKLOAD is set. The simplest way to activate this feature from user-space is: -------------------------------- %< ----------------------------------- int main(void) { char buffer[1024]; int timerlat_fd; int retval; long cpu = 0; /* place in CPU 0 */ cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); if (sched_setaffinity(gettid(), sizeof(set), &set) == -1) return 1; snprintf(buffer, sizeof(buffer), "/sys/kernel/tracing/osnoise/per_cpu/cpu%ld/timerlat_fd", cpu); timerlat_fd = open(buffer, O_RDONLY); if (timerlat_fd < 0) { printf("error opening %s: %s\n", buffer, strerror(errno)); exit(1); } for (;;) { retval = read(timerlat_fd, buffer, 1024); if (retval < 0) break; } close(timerlat_fd); exit(0); } -------------------------------- >% ----------------------------------- When disabling timerlat, if there is a workload holding the timerlat_fd, SIGKILL is sent to the thread.
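To exercise the example, the tracer has to be put in timerlat mode with the in-kernel workload disabled first; a plausible sequence (assuming tracefs is mounted at /sys/kernel/tracing):

  cd /sys/kernel/tracing
  echo NO_OSNOISE_WORKLOAD > osnoise/options
  echo timerlat > current_tracer
  # now run the program above, which pins itself to CPU 0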
Link: https://lkml.kernel.org/r/69fe66a863d2792ff4c3a149bf9e32e26468bb3a.1686063934.git.bristot@kernel.org Cc: Juri Lelli Cc: William White Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 378 ++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_output.c | 4 +- 2 files changed, 377 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 220172cb874d..bd0d01d00fb9 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -181,6 +181,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -238,6 +239,7 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); @@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. 
+ */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt, } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1217,11 +1291,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1229,6 +1311,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1864,10 +1947,24 @@ static void stop_kthread(unsigned int cpu) kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - kthread_stop(kthread); + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); return; @@ -1912,7 +2009,6 @@ static int start_kthread(unsigned int cpu) barrier(); return 0; } - snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1941,6 +2037,11 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. @@ -2281,6 +2382,223 @@ err_free: return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. 
+ */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? 
+ */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -2344,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = { .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -2381,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. 
+ */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..9f10c0071c4f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = { }; /* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; static enum print_line_t trace_timerlat_print(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags, trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", field->seqnum, - field->context ? "thread" : "irq", + timerlat_lat_context[field->context], field->timer_latency); return trace_handle_return(s); -- cgit v1.2.3 From 38638ffa6059049334b4d87bd4d85cf3418b5e27 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 13 Jun 2023 00:41:25 +0000 Subject: tracing/boot: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). Direct replacement is safe here since return value of -E2BIG is used to check for truncation instead of sizeof(dest). 
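As an illustration of the converted check's semantics, here is a minimal sketch (the helper name and buffer size are hypothetical, not from the patch):

	/* sketch: reject an oversized string without over-reading the source */
	static bool copy_option(char *buf, size_t bufsz, const char *p)
	{
		/* strscpy() NUL-terminates buf and returns -E2BIG on truncation;
		 * unlike strlcpy(), it never reads src past the destination limit */
		if (strscpy(buf, p, bufsz) == -E2BIG) {
			pr_err("String is too long: %s\n", p);
			return false;
		}
		return true;
	}

Because strscpy() stops reading at the destination bound, a non-NUL-terminated source cannot cause a linear read overflow, which is what makes the direct replacement safe.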
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Link: https://lore.kernel.org/linux-trace-kernel/20230613004125.3539934-1-azeemshaikh38@gmail.com Cc: Masami Hiramatsu Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 778200dd8ede..5fe525f1b8cc 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); -- cgit v1.2.3 From 4dd595c34c4bb22c16a76206a18c13e4e194335d Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Wed, 21 Jun 2023 22:36:00 +0000 Subject: syscalls: Remove file path comments from headers Source file locations for syscall definitions can change over a period of time. File paths in comments get stale and are hard to maintain long term. Also, their usefulness is questionable since it would be easier to locate a syscall definition using the SYSCALL_DEFINEx() macro. Remove all source file path comments from the syscall headers. Also, equalize the uneven line spacing (some of which is introduced due to the deletions). 
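For context, a hypothetical definition site shows why the macro is a better locator than a path comment (the syscall name and body here are made up for illustration):

	/* findable by searching for SYSCALL_DEFINE plus the name,
	 * no matter which source file the definition moves to */
	SYSCALL_DEFINE3(example, int, fd, const char __user *, buf, size_t, len)
	{
		return -ENOSYS;	/* placeholder body */
	}

A stale "/* fs/foo.c */" comment, by contrast, silently goes wrong the moment the definition is relocated.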
Signed-off-by: Sohil Mehta Signed-off-by: Arnd Bergmann --- kernel/sys_ni.c | 110 +------------------------------------------------------- 1 file changed, 1 insertion(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..f207236002e9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -51,99 +51,35 @@ COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_register); - -/* fs/xattr.c */ - -/* fs/dcache.c */ - -/* fs/cookies.c */ COND_SYSCALL(lookup_dcookie); COND_SYSCALL_COMPAT(lookup_dcookie); - -/* fs/eventfd.c */ COND_SYSCALL(eventfd2); - -/* fs/eventfd.c */ COND_SYSCALL(epoll_create1); COND_SYSCALL(epoll_ctl); COND_SYSCALL(epoll_pwait); COND_SYSCALL_COMPAT(epoll_pwait); COND_SYSCALL(epoll_pwait2); COND_SYSCALL_COMPAT(epoll_pwait2); - -/* fs/fcntl.c */ - -/* fs/inotify_user.c */ COND_SYSCALL(inotify_init1); COND_SYSCALL(inotify_add_watch); COND_SYSCALL(inotify_rm_watch); - -/* fs/ioctl.c */ - -/* fs/ioprio.c */ COND_SYSCALL(ioprio_set); COND_SYSCALL(ioprio_get); - -/* fs/locks.c */ COND_SYSCALL(flock); - -/* fs/namei.c */ - -/* fs/namespace.c */ - -/* fs/nfsctl.c */ - -/* fs/open.c */ - -/* fs/pipe.c */ - -/* fs/quota.c */ COND_SYSCALL(quotactl); COND_SYSCALL(quotactl_fd); - -/* fs/readdir.c */ - -/* fs/read_write.c */ - -/* fs/sendfile.c */ - -/* fs/select.c */ - -/* fs/signalfd.c */ COND_SYSCALL(signalfd4); COND_SYSCALL_COMPAT(signalfd4); - -/* fs/splice.c */ - -/* fs/stat.c */ - -/* fs/sync.c */ - -/* fs/timerfd.c */ COND_SYSCALL(timerfd_create); COND_SYSCALL(timerfd_settime); COND_SYSCALL(timerfd_settime32); COND_SYSCALL(timerfd_gettime); COND_SYSCALL(timerfd_gettime32); - -/* fs/utimes.c */ - -/* kernel/acct.c */ COND_SYSCALL(acct); - -/* kernel/capability.c */ COND_SYSCALL(capget); COND_SYSCALL(capset); - -/* kernel/exec_domain.c */ - -/* kernel/exit.c */ - -/* kernel/fork.c */ /* __ARCH_WANT_SYS_CLONE3 */ COND_SYSCALL(clone3); - -/* kernel/futex/syscalls.c */ COND_SYSCALL(futex); COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); @@ -151,29 +87,11 @@ COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); - -/* kernel/hrtimer.c */ - -/* kernel/itimer.c */ - -/* kernel/kexec.c */ COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); - -/* kernel/module.c */ COND_SYSCALL(init_module); COND_SYSCALL(delete_module); - -/* kernel/posix-timers.c */ - -/* kernel/printk.c */ COND_SYSCALL(syslog); - -/* kernel/ptrace.c */ - -/* kernel/sched/core.c */ - -/* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); COND_SYSCALL(setreuid); @@ -186,12 +104,6 @@ COND_SYSCALL(setfsuid); COND_SYSCALL(setfsgid); COND_SYSCALL(setgroups); COND_SYSCALL(getgroups); - -/* kernel/time.c */ - -/* kernel/timer.c */ - -/* ipc/mqueue.c */ COND_SYSCALL(mq_open); COND_SYSCALL_COMPAT(mq_open); COND_SYSCALL(mq_unlink); @@ -203,8 +115,6 @@ COND_SYSCALL(mq_notify); COND_SYSCALL_COMPAT(mq_notify); COND_SYSCALL(mq_getsetattr); COND_SYSCALL_COMPAT(mq_getsetattr); - -/* ipc/msg.c */ COND_SYSCALL(msgget); COND_SYSCALL(old_msgctl); COND_SYSCALL(msgctl); @@ -214,8 +124,6 @@ COND_SYSCALL(msgrcv); COND_SYSCALL_COMPAT(msgrcv); COND_SYSCALL(msgsnd); COND_SYSCALL_COMPAT(msgsnd); - -/* ipc/sem.c */ COND_SYSCALL(semget); COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); @@ -224,8 +132,6 @@ COND_SYSCALL_COMPAT(old_semctl); COND_SYSCALL(semtimedop); COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); - -/* ipc/shm.c 
*/ COND_SYSCALL(shmget); COND_SYSCALL(old_shmctl); COND_SYSCALL(shmctl); @@ -234,8 +140,6 @@ COND_SYSCALL_COMPAT(old_shmctl); COND_SYSCALL(shmat); COND_SYSCALL_COMPAT(shmat); COND_SYSCALL(shmdt); - -/* net/socket.c */ COND_SYSCALL(socket); COND_SYSCALL(socketpair); COND_SYSCALL(bind); @@ -256,30 +160,18 @@ COND_SYSCALL(sendmsg); COND_SYSCALL_COMPAT(sendmsg); COND_SYSCALL(recvmsg); COND_SYSCALL_COMPAT(recvmsg); - -/* mm/filemap.c */ - -/* mm/nommu.c, also with MMU */ COND_SYSCALL(mremap); - -/* security/keys/keyctl.c */ COND_SYSCALL(add_key); COND_SYSCALL(request_key); COND_SYSCALL(keyctl); COND_SYSCALL_COMPAT(keyctl); - -/* security/landlock/syscalls.c */ COND_SYSCALL(landlock_create_ruleset); COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); - -/* arch/example/kernel/sys_example.c */ - -/* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); -/* mm/, CONFIG_MMU only */ +/* CONFIG_MMU only */ COND_SYSCALL(swapon); COND_SYSCALL(swapoff); COND_SYSCALL(mprotect); -- cgit v1.2.3 From 81621430c81bb7965c3d5807039bc2b5b3ec87ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 08:51:14 -1000 Subject: Revert "cgroup: Avoid -Wstringop-overflow warnings" This reverts commit 36de5f303ca1bd6fce74815ef17ef3d8ff8737b5. The commit caused boot failures on some configurations due to cgroup hierarchies not being created at all. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1404e8e8abf7..065bebb4af9b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1198,9 +1198,6 @@ static struct css_set *find_css_set(struct css_set *old_cset, unsigned long key; int ssid; - if (!CGROUP_HAS_SUBSYS_CONFIG) - return NULL; - lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches @@ -6020,9 +6017,6 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - if (!CGROUP_HAS_SUBSYS_CONFIG) - return -EINVAL; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); -- cgit v1.2.3 From ed5f297802fca41d88fbfa6f9c13b218e7c6f5cb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 4 Jun 2023 11:29:00 +0900 Subject: tracing/probes: Fix to return NULL and keep using current argc Fix to return NULL and keep using current argc when there is $argN and the BTF is not available. 
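In outline, the fixed contract looks like this (a simplified sketch; the signature is shortened and the two helpers are hypothetical - see the hunk below for the real code):

	static const char **expand_meta_args(int argc, const char *argv[],
					     int *new_argc)
	{
		const char **new_argv;

		if (!btf_available()) {		/* hypothetical predicate */
			*new_argc = argc;	/* caller keeps using current argc */
			return NULL;		/* no expanded argv was built */
		}

		new_argv = build_expanded_argv(argc, argv, new_argc); /* hypothetical */
		return new_argv;
	}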
Link: https://lore.kernel.org/all/168584574094.2056209.2694238431743782342.stgit@mhiramat.roam.corp.google.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306030940.Cej2JoUx-lkp@intel.com/ Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ba1c6e059b51..473e1c43bc57 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1273,7 +1273,8 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], trace_probe_log_err(0, NOSUP_BTFARG); return (const char **)params; } - return 0; + *new_argc = argc; + return NULL; } ctx->params = params; ctx->nr_params = nr_params; -- cgit v1.2.3 From 53431798f4bb60d214ae1ec4a79eefdd414f577b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 12 Jun 2023 20:58:57 +0900 Subject: tracing/probes: Fix tracepoint event with $arg* to fetch correct argument To hide the first dummy 'data' argument on the tracepoint probe events, the BTF argument array was modified (skip the first argument for tracepoint), but the '$arg*' meta argument parser missed that. Fix to increment the argument index if it is tracepoint probe. And decrement the index when searching the type of the argument. Link: https://lore.kernel.org/all/168657113778.3038017.12245893750241701312.stgit@mhiramat.roam.corp.google.com/ Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 473e1c43bc57..643aa3a51d5a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -456,7 +456,10 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, if (name && !strcmp(name, varname)) { code->op = FETCH_OP_ARG; - code->param = i; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; return 0; } } @@ -470,8 +473,11 @@ static const struct fetch_type *parse_btf_arg_type(int arg_idx, struct btf *btf = traceprobe_get_btf(); const char *typestr = NULL; - if (btf && ctx->params) + if (btf && ctx->params) { + if (ctx->flags & TPARG_FL_TPOINT) + arg_idx--; typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + } return find_fetch_type(typestr, ctx->flags); } -- cgit v1.2.3 From 4afc9a402aa3890885747b396c1adcd45f127665 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 8 Jun 2023 16:23:12 +0800 Subject: kernel/time/posix-stubs.c: remove duplicated include ./kernel/time/posix-stubs.c: linux/syscalls.h is included more than once. 
Link: https://lkml.kernel.org/r/20230608082312.123939-1-yang.lee@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5463 Fixes: c1956519cd7e ("syscalls: add sys_ni_posix_timers prototype") Signed-off-by: Yang Li Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- kernel/time/posix-stubs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 39769b2d1005..828aeecbd1e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -16,7 +16,6 @@ #include #include #include -#include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* Architectures may override SYS_NI and COMPAT_SYS_NI */ -- cgit v1.2.3 From 5f81018753dfd4989e33ece1f0cb6b8aae498b82 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Jun 2023 13:52:36 +0200 Subject: fprobe: Release rethook after the ftrace_ops is unregistered While running bpf selftests it's possible to get the following fault: general protection fault, probably for non-canonical address \ 0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI ... Call Trace: fprobe_handler+0xc1/0x270 ? __pfx_bpf_testmod_init+0x10/0x10 ? __pfx_bpf_testmod_init+0x10/0x10 ? bpf_fentry_test1+0x5/0x10 ? bpf_fentry_test1+0x5/0x10 ? bpf_testmod_init+0x22/0x80 ? do_one_initcall+0x63/0x2e0 ? rcu_is_watching+0xd/0x40 ? kmalloc_trace+0xaf/0xc0 ? do_init_module+0x60/0x250 ? __do_sys_finit_module+0xac/0x120 ? do_syscall_64+0x37/0x90 ? entry_SYSCALL_64_after_hwframe+0x72/0xdc In the unregister_fprobe() function we can't release fp->rethook while it's possible that some of its users are still running on another CPU. Move the rethook_free() call to after fp->ops is unregistered with the unregister_ftrace_function() call. Link: https://lore.kernel.org/all/20230615115236.3476617-1-jolsa@kernel.org/ Fixes: 5b0ab78998e3 ("fprobe: Add exit_handler support") Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (Google) Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 18d36842faf5..0121e8c0d54e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -364,19 +364,13 @@ int unregister_fprobe(struct fprobe *fp) fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; - /* - * rethook_free() starts disabling the rethook, but the rethook handlers - * may be running on other processors at this point. To make sure that all - * current running handlers are finished, call unregister_ftrace_function() - * after this. - */ - if (fp->rethook) - rethook_free(fp->rethook); - ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; + if (fp->rethook) + rethook_free(fp->rethook); + ftrace_free_filter(&fp->ops); return ret; -- cgit v1.2.3 From f6d026eea390d59787a6cdc2ef5c983d02e029d0 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 26 Jun 2023 19:13:42 +0800 Subject: tracing/user_events: Fix incorrect return value for writing operation when events are disabled The writing operation returns the count of writes regardless of whether events are enabled or disabled. Switch it to return -EBADF to indicate that the event is disabled.
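A condensed sketch of the intended behavior (names here are hypothetical; the actual one-line change is in the hunk below):

	static ssize_t write_event(struct user_event *user, struct iov_iter *i)
	{
		if (unlikely(!READ_ONCE(user->enabled)))	/* hypothetical flag */
			return -EBADF;	/* disabled: fail instead of faking success */

		return deliver(user, i);	/* hypothetical delivery, returns count */
	}

Returning -EBADF lets a writer distinguish "delivered" from "silently dropped because the event is off", which the old unconditional count return could not.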
Link: https://lkml.kernel.org/r/20230626111344.19136-2-sunliming@kylinos.cn Cc: stable@vger.kernel.org Fixes: 7f5a08c79df35 ("user_events: Add minimal support for trace_event into ftrace") Acked-by: Beau Belgrave Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 8df0550415e7..09f7d9167b8e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2096,7 +2096,8 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(faulted)) return -EFAULT; - } + } else + return -EBADF; return ret; } -- cgit v1.2.3 From 054a73009c22a5fb8bbeee5394980809276bc9fe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 May 2023 20:55:13 -0400 Subject: module: split up 'finit_module()' into init_module_from_file() helper This will simplify the next step, where we can then key off the inode to do one idempotent module load. Let's do the obvious re-organization in one step, and then the new code in another. Signed-off-by: Linus Torvalds --- kernel/module/main.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 4e2cf784cf8c..23abfe3e2674 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3057,26 +3057,16 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { struct load_info info = { }; void *buf = NULL; int len; - int err; - - err = may_init_module(); - if (err) - return err; - pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + if (!f || !(f->f_mode & FMODE_READ)) + return -EBADF; - if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC - |MODULE_INIT_COMPRESSED_FILE)) - return -EINVAL; - - len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, - READING_MODULE); + len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); mod_stat_add_long(len, &invalid_kread_bytes); @@ -3084,7 +3074,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) } if (flags & MODULE_INIT_COMPRESSED_FILE) { - err = module_decompress(&info, buf, len); + int err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ if (err) { mod_stat_inc(&failed_decompress); @@ -3099,6 +3089,28 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) return load_module(&info, uargs, flags); } +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct fd f; + + err = may_init_module(); + if (err) + return err; + + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) + return -EINVAL; + + f = fdget(fd); + err = init_module_from_file(f.file, uargs, flags); + fdput(f); + return err; +} + /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!!
*/ char *module_flags(struct module *mod, char *buf, bool show_state) { -- cgit v1.2.3 From 9b9879fc03275ffe0da328cf5b864d9e694167c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 May 2023 21:39:51 -0400 Subject: modules: catch concurrent module loads, treat them as idempotent This is the new-and-improved attempt at avoiding huge memory load spikes when the user space boot sequence tries to load hundreds (or even thousands) of redundant duplicate modules in parallel. See commit 9828ed3f695a ("module: error out early on concurrent load of the same module file") for background and an earlier failed attempt that was reverted. That earlier attempt just said "concurrently loading the same module is silly, just open the module file exclusively and return -ETXTBSY if somebody else is already loading it". While it is true that concurrent module loads of the same module is silly, the reason that earlier attempt then failed was that the concurrently loaded module would often be a prerequisite for another module. Thus failing to load the prerequisite would then cause cascading failures of the other modules, rather than just short-circuiting that one unnecessary module load. At the same time, we still really don't want to load the contents of the same module file hundreds of times, only to then wait for an eventually successful load, and have everybody else return -EEXIST. As a result, this takes another approach, and treats concurrent module loads from the same file as "idempotent" in the inode. So if one module load is ongoing, we don't start a new one, but instead just wait for the first one to complete and return the same return value as it did. So unlike the first attempt, this does not return early: the intent is not to speed up the boot, but to avoid a thundering herd problem in allocating memory (both physical and virtual) for a module more than once. Also note that this does change behavior: it used to be that when you had concurrent loads, you'd have one "winner" that would return success, and everybody else would return -EEXIST. In contrast, this idempotent logic goes all Oprah on the problem, and says "You are a winner! And you are a winner! We are ALL winners". But since there's no possible actual real semantic difference between "you loaded the module" and "somebody else already loaded the module", this is more of a feel-good change than an actual honest-to-goodness semantic change. Of course, any true Johnny-come-latelies that don't get caught in the concurrency filter will still return -EEXIST. It's no different from not even getting a seat at an Oprah taping. That's life. See the long thread on the kernel mailing list about this all, which includes some numbers for memory use before and after the patch. 
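From userspace, the visible change can be sketched like this (illustrative only; the module path is hypothetical, and raw syscall() reports failure as -1 with errno set):

	#include <fcntl.h>
	#include <pthread.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int fd;

	static void *load(void *ret)
	{
		*(long *)ret = syscall(SYS_finit_module, fd, "", 0);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;
		long r1, r2;

		fd = open("/lib/modules/example/some.ko", O_RDONLY);
		pthread_create(&a, NULL, load, &r1);
		pthread_create(&b, NULL, load, &r2);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		/* before: one thread succeeds, the other typically fails with
		 * errno == EEXIST; after: if the two loads overlap, both get
		 * the first loader's result */
		return 0;
	}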
Link: https://lore.kernel.org/lkml/20230524213620.3509138-1-mcgrof@kernel.org/ Reviewed-by: Johan Hovold Tested-by: Johan Hovold Tested-by: Luis Chamberlain Tested-by: Dan Williams Tested-by: Rudi Heitbaum Tested-by: David Hildenbrand Signed-off-by: Linus Torvalds --- kernel/module/main.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 23abfe3e2674..d6de66a6a1ac 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3057,15 +3057,82 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } +struct idempotent { + const void *cookie; + struct hlist_node entry; + struct completion complete; + int ret; +}; + +#define IDEM_HASH_BITS 8 +static struct hlist_head idem_hash[1 << IDEM_HASH_BITS]; +static DEFINE_SPINLOCK(idem_lock); + +static bool idempotent(struct idempotent *u, const void *cookie) +{ + int hash = hash_ptr(cookie, IDEM_HASH_BITS); + struct hlist_head *head = idem_hash + hash; + struct idempotent *existing; + bool first; + + u->ret = 0; + u->cookie = cookie; + init_completion(&u->complete); + + spin_lock(&idem_lock); + first = true; + hlist_for_each_entry(existing, head, entry) { + if (existing->cookie != cookie) + continue; + first = false; + break; + } + hlist_add_head(&u->entry, idem_hash + hash); + spin_unlock(&idem_lock); + + return !first; +} + +/* + * We were the first one with 'cookie' on the list, and we ended + * up completing the operation. We now need to walk the list, + * remove everybody - which includes ourselves - fill in the return + * value, and then complete the operation. + */ +static void idempotent_complete(struct idempotent *u, int ret) +{ + const void *cookie = u->cookie; + int hash = hash_ptr(cookie, IDEM_HASH_BITS); + struct hlist_head *head = idem_hash + hash; + struct hlist_node *next; + struct idempotent *pos; + + spin_lock(&idem_lock); + hlist_for_each_entry_safe(pos, next, head, entry) { + if (pos->cookie != cookie) + continue; + hlist_del(&pos->entry); + pos->ret = ret; + complete(&pos->complete); + } + spin_unlock(&idem_lock); +} + static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { + struct idempotent idem; struct load_info info = { }; void *buf = NULL; - int len; + int len, ret; if (!f || !(f->f_mode & FMODE_READ)) return -EBADF; + if (idempotent(&idem, file_inode(f))) { + wait_for_completion(&idem.complete); + return idem.ret; + } + len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); @@ -3086,7 +3153,9 @@ static int init_module_from_file(struct file *f, const char __user * uargs, int info.len = len; } - return load_module(&info, uargs, flags); + ret = load_module(&info, uargs, flags); + idempotent_complete(&idem, ret); + return ret; } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) -- cgit v1.2.3 From aabd12609f91155f26584508b01f548215cc3c0c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 26 Jun 2023 15:01:03 +0200 Subject: swiotlb: always set the number of areas before allocating the pool The number of areas defaults to the number of possible CPUs. However, the total number of slots may have to be increased after adjusting the number of areas. Consequently, the number of areas must be determined before allocating the memory pool. 
This is even explained with a comment in swiotlb_init_remap(), but swiotlb_init_late() adjusts the number of areas after slots are already allocated. The areas may end up being smaller than IO_TLB_SEGSIZE, which breaks per-area locking. While fixing swiotlb_init_late(), move all relevant comments before the definition of swiotlb_adjust_nareas() and convert them to kernel-doc. Fixes: 20347fca71a3 ("swiotlb: split up the global swiotlb lock") Signed-off-by: Petr Tesarik Reviewed-by: Roberto Sassu Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 775f7bb10ab1..89db590f931f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -115,9 +115,16 @@ static bool round_up_default_nslabs(void) return true; } +/** + * swiotlb_adjust_nareas() - adjust the number of areas and slots + * @nareas: Desired number of areas. Zero is treated as 1. + * + * Adjust the default number of areas in a memory pool. + * The default size of the memory pool may also change to meet minimum area + * size requirements. + */ static void swiotlb_adjust_nareas(unsigned int nareas) { - /* use a single area when non is specified */ if (!nareas) nareas = 1; else if (!is_power_of_2(nareas)) @@ -298,10 +305,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, if (swiotlb_force_disable) return; - /* - * default_nslabs maybe changed when adjust area number. - * So allocate bounce buffer after adjusting area number. - */ if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); @@ -363,6 +366,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, if (swiotlb_force_disable) return 0; + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; @@ -397,9 +403,6 @@ retry: (PAGE_SIZE << order) >> 20); } - if (!default_nareas) - swiotlb_adjust_nareas(num_possible_cpus()); - area_order = get_order(array_size(sizeof(*mem->areas), default_nareas)); mem->areas = (struct io_tlb_area *) -- cgit v1.2.3 From 8ac04063354a01a484d2e55d20ed1958aa0d3392 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 26 Jun 2023 15:01:04 +0200 Subject: swiotlb: reduce the number of areas to match actual memory pool size Although the desired size of the SWIOTLB memory pool is increased in swiotlb_adjust_nareas() to match the number of areas, the actual allocation may be smaller, which may require reducing the number of areas. For example, Xen uses swiotlb_init_late(), which in turn uses the page allocator. On x86, page size is 4 KiB and MAX_ORDER is 10 (1024 pages), resulting in a maximum memory pool size of 4 MiB. This corresponds to 2048 slots of 2 KiB each. The minimum area size is 128 (IO_TLB_SEGSIZE), allowing at most 2048 / 128 = 16 areas. If num_possible_cpus() is greater than the maximum number of areas, areas are smaller than IO_TLB_SEGSIZE and contiguous groups of free slots will span multiple areas. When allocating and freeing slots, only one area will be properly locked, causing race conditions on the unlocked slots and ultimately data corruption, kernel hangs and crashes. 
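The arithmetic in that example, spelled out (an illustrative helper; the constants are IO_TLB_SHIFT = 11 and IO_TLB_SEGSIZE = 128):

	static unsigned int late_pool_max_areas(void)
	{
		unsigned long pool_bytes = 1024UL * 4096;	/* MAX_ORDER pages = 4 MiB */
		unsigned long nslots = pool_bytes >> 11;	/* 2048 slots of 2 KiB */

		return nslots / 128;				/* at most 16 full areas */
	}

With more than 16 possible CPUs, the default number of areas exceeds this bound, each area drops below IO_TLB_SEGSIZE slots, and a contiguous group of free slots can span an area whose lock is not held - the race described above.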
Fixes: 20347fca71a3 ("swiotlb: split up the global swiotlb lock") Signed-off-by: Petr Tesarik Reviewed-by: Roberto Sassu Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 89db590f931f..2b83e3ad9dca 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -138,6 +138,23 @@ static void swiotlb_adjust_nareas(unsigned int nareas) (default_nslabs << IO_TLB_SHIFT) >> 20); } +/** + * limit_nareas() - get the maximum number of areas for a given memory pool size + * @nareas: Desired number of areas. + * @nslots: Total number of slots in the memory pool. + * + * Limit the number of areas to the maximum possible number of areas in + * a memory pool of the given size. + * + * Return: Maximum possible number of areas. + */ +static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots) +{ + if (nslots < nareas * IO_TLB_SEGSIZE) + return nslots / IO_TLB_SEGSIZE; + return nareas; +} + static int __init setup_io_tlb_npages(char *str) { @@ -297,6 +314,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, { struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs; + unsigned int nareas; size_t alloc_size; void *tlb; @@ -309,10 +327,12 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; + nareas = limit_nareas(default_nareas, nslabs); while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { if (nslabs <= IO_TLB_MIN_SLABS) return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + nareas = limit_nareas(nareas, nslabs); } if (default_nslabs != nslabs) { @@ -358,6 +378,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, { struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + unsigned int nareas; unsigned char *vstart = NULL; unsigned int order, area_order; bool retried = false; @@ -403,8 +424,8 @@ retry: (PAGE_SIZE << order) >> 20); } - area_order = get_order(array_size(sizeof(*mem->areas), - default_nareas)); + nareas = limit_nareas(default_nareas, nslabs); + area_order = get_order(array_size(sizeof(*mem->areas), nareas)); mem->areas = (struct io_tlb_area *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); if (!mem->areas) @@ -418,7 +439,7 @@ retry: set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true, - default_nareas); + nareas); swiotlb_print_info(); return 0; -- cgit v1.2.3 From 0914e4d3cda853471a72b2c26a516c7619658b87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:47:53 +0200 Subject: kdb: include kdb_private.h for function prototypes The kdb_kbd_cleanup_state() is called from another file through the kdb_private.h file, but that is not included before the definition, causing a W=1 warning: kernel/debug/kdb/kdb_keyboard.c:198:6: error: no previous prototype for 'kdb_kbd_cleanup_state' [-Werror=missing-prototypes] Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230517124802.929751-1-arnd@kernel.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_keyboard.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index f87c750d3eb3..3c2987f46f6e 100644 --- 
a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -13,6 +13,8 @@ #include #include +#include "kdb_private.h" + /* Keyboard Controller Registers on normal PCs. */ #define KBD_STATUS_REG 0x64 /* Status register (R) */ -- cgit v1.2.3 From 554588e8e932e7a0fac7d3ae2132f2b727d9acfe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Jun 2023 14:08:54 +0200 Subject: sysctl: fix unused proc_cap_handler() function warning Since usermodehelper_table() is marked static now, we get a warning about it being unused when SYSCTL is disabled: kernel/umh.c:497:12: error: 'proc_cap_handler' defined but not used [-Werror=unused-function] Just move it inside of the same #ifdef. Signed-off-by: Arnd Bergmann Tested-by: Matthieu Baerts Fixes: 861dc0b46432 ("sysctl: move umh sysctl registration to its own file") Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested [mcgrof: adjust new commit ID for Fixes tag] Signed-off-by: Luis Chamberlain --- kernel/umh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 41088c5c39fd..1b13c5d34624 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -494,6 +494,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) } EXPORT_SYMBOL(call_usermodehelper); +#if defined(CONFIG_SYSCTL) static int proc_cap_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -544,7 +545,6 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -#if defined(CONFIG_SYSCTL) static struct ctl_table usermodehelper_table[] = { { .procname = "bset", -- cgit v1.2.3 From 67a4e1a3bf7c68ed3fbefc4213648165d912cabb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 28 Jun 2023 18:02:51 +0300 Subject: irqdomain: Use return value of strreplace() Since strreplace() returns the pointer to the string itself, use it directly. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230628150251.17832-1-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5bd01624e447..0bdef4fe925b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -182,9 +182,7 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, return NULL; } - strreplace(name, '/', ':'); - - domain->name = name; + domain->name = strreplace(name, '/', ':'); domain->fwnode = fwnode; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } -- cgit v1.2.3 From 1ed0555850cdaa3113a32891d273f7a859565264 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 28 Jun 2023 12:56:17 -0700 Subject: kdb: Handle LF in the command parser The main kdb command parser only handles CR (ASCII 13 AKA '\r') today, but not LF (ASCII 10 AKA '\n'). That means that the kdb command parser can handle terminals that send just CR or that send CR+LF but can't handle terminals that send just LF. The fact that kdb didn't handle LF in the command parser tripped up a tool I tried to use with it. Specifically, I was trying to send a command to my device to resume it from kdb using a ChromeOS tool like: dut-control cpu_uart_cmd:"g" That tool only terminates lines with LF, not CR+LF. Arguably the ChromeOS tool should be fixed. 
After all, officially kdb seems to be designed such that CR+LF is the official line ending transmitted over the wire and that internally a line ending is just '\n' (LF). Some evidence: * uart_poll_put_char(), which is used by kdb, notices a '\n' and converts it to '\r\n'. * kdb functions specifically use '\r' to get a carriage return without a newline. You can see this in the pager where kdb will write a '\r' and then write over the pager prompt. However, all that being said there's no real harm in accepting LF as a command terminator in the kdb parser and doing so seems like it would improve compatibility. After this, I'd expect that things would work OK-ish with a remote terminal that used any of CR, CR+LF, or LF as a line ending. Someone using CR as a line ending might get some ugliness where kdb wasn't able to overwrite the last line, but basic commands would work. Someone using just LF as a line ending would probably also work OK. A few other notes: - It can be noted that "bash" running on an "agetty" handles LF as a line termination with no complaints. - Historically, kdb's "pager" actually handled either CR or LF fine. A very quick inspection would make one think that kdb's pager actually could have paged down two lines instead of one for anyone using CR+LF, but this is generally avoided because of kdb_input_flush(). - Conceivably one could argue that some of this special case logic belongs in uart_poll_get_char() since uart_poll_put_char() handles the '\n' => '\r\n' conversion. I would argue that perhaps we should eventually do the opposite and move the '\n' => '\r\n' out of uart_poll_put_char(). Having that conversion at such a low level could interfere if we ever want to transfer binary data. In addition, if we truly made uart_poll_get_char() the inverse of uart_poll_put_char() it would convert back to '\n' and (ironically) kdb's parser currently only looks for '\r' to find the end of a command. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20230628125612.1.I5cc6c3d916195f5bcfdf5b75d823f2037707f5dc@changeid Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 5c7e9ba7cd6b..813cb6cf72d6 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -131,6 +131,7 @@ char kdb_getchar(void) int escape_delay = 0; get_char_func *f, *f_prev = NULL; int key; + static bool last_char_was_cr; for (f = &kdb_poll_funcs[0]; ; ++f) { if (*f == NULL) { @@ -149,6 +150,18 @@ char kdb_getchar(void) continue; } + /* + * The caller expects that newlines are either CR or LF. However + * some terminals send _both_ CR and LF. Avoid having to handle + * this in the caller by stripping the LF if we saw a CR right + * before. 
+ */ + if (last_char_was_cr && key == '\n') { + last_char_was_cr = false; + continue; + } + last_char_was_cr = (key == '\r'); + /* * When the first character is received (or we get a change * input source) we set ourselves up to handle an escape @@ -244,7 +257,8 @@ poll_again: *cp = tmp; } break; - case 13: /* enter */ + case 10: /* linefeed */ + case 13: /* carriage return */ *lastchar++ = '\n'; *lastchar++ = '\0'; if (!KDB_STATE(KGDB_TRANS)) { -- cgit v1.2.3 From b69f0aeb068980af983d399deafc7477cec8bc04 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 Jun 2023 09:46:17 +0200 Subject: pid: Replace struct pid 1-element array with flex-array For pid namespaces, struct pid uses a dynamically sized array member, "numbers". This was implemented using the ancient 1-element fake flexible array, which has been deprecated for decades. Replace it with a C99 flexible array, refactor the array size calculations to use struct_size(), and address elements via indexes. Note that the static initializer (which defines a single element) works as-is, and requires no special handling. Without this, CONFIG_UBSAN_BOUNDS (and potentially CONFIG_FORTIFY_SOURCE) will trigger bounds checks: https://lore.kernel.org/lkml/20230517-bushaltestelle-super-e223978c1ba6@brauner Cc: Christian Brauner Cc: Jan Kara Cc: Jeff Xu Cc: Andreas Gruenbacher Cc: Daniel Verkamp Cc: "Paul E. McKenney" Cc: Jeff Xu Cc: Andrew Morton Cc: Boqun Feng Cc: Luis Chamberlain Cc: Frederic Weisbecker Reported-by: syzbot+ac3b41786a2d0565b6d5@syzkaller.appspotmail.com [brauner: dropped unrelated changes and remove 0 with NULL cast] Signed-off-by: Kees Cook Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- kernel/pid.c | 7 +++++-- kernel/pid_namespace.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f93954a0384d..8bce3aebc949 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -656,8 +656,11 @@ void __init pid_idr_init(void) idr_init(&init_pid_ns.idr); - init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + init_pid_ns.pid_cachep = kmem_cache_create("pid", + struct_size((struct pid *)NULL, numbers, 1), + __alignof__(struct pid), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b43eee07b00c..70a929784a5d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -48,7 +48,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); - len = sizeof(struct pid) + level * sizeof(struct upid); + len = struct_size((struct pid *)NULL, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) -- cgit v1.2.3 From dd546618ba704be4f3724a11e5a194052c551f08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 1 Jul 2023 08:44:44 +0200 Subject: pid: use struct_size_t() helper Before commit d67790ddf021 ("overflow: Add struct_size_t() helper") only struct_size() existed, which expects a valid pointer instance containing the flexible array. However, when we determine the default struct pid allocation size for the associated kmem cache of a pid namespace we need to take the nesting depth of the pid namespace into account without a variable instance necessarily being available.
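The difference is easiest to see in a sketch (the helper is hypothetical; both forms compute sizeof(struct pid) plus room for level + 1 flexible-array "numbers" entries, with overflow checking):

	static size_t pid_alloc_len(unsigned int level)
	{
		/* old form, needing a faked instance pointer:
		 *	struct_size((struct pid *)NULL, numbers, level + 1)
		 * new form, needing only the type: */
		return struct_size_t(struct pid, numbers, level + 1);
	}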
In commit b69f0aeb0689 ("pid: Replace struct pid 1-element array with flex-array") we used to handle this the old fashioned way and cast NULL to a struct pid pointer type. However, we do apparently have a dedicated struct_size_t() helper for exactly this case. So switch to that. Suggested-by: Kees Cook Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- kernel/pid_namespace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 8bce3aebc949..6a1d23a11026 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -657,7 +657,7 @@ void __init pid_idr_init(void) idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = kmem_cache_create("pid", - struct_size((struct pid *)NULL, numbers, 1), + struct_size_t(struct pid, numbers, 1), __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 70a929784a5d..0bf44afe04dd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -48,7 +48,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); - len = struct_size((struct pid *)NULL, numbers, level + 1); + len = struct_size_t(struct pid, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) -- cgit v1.2.3 From b6464883f45ae6412de33e53587974fd86ba811e Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 30 Jun 2023 21:12:06 +0100 Subject: kdb: move kdb_send_sig() declaration to a better header file kdb_send_sig() is defined in the signal code and called from kdb, but the declaration is part of the kdb internal code. Move the declaration to the shared header to avoid the warning: kernel/signal.c:4789:6: error: no previous prototype for 'kdb_send_sig' [-Werror=missing-prototypes] Reported-by: Arnd Bergmann Closes: https://lore.kernel.org/lkml/20230517125423.930967-1-arnd@kernel.org/ Signed-off-by: Daniel Thompson Link: https://lore.kernel.org/r/20230630201206.2396930-1-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_private.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1f8c519a5f81..548fd4059bf9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -194,7 +194,6 @@ extern char kdb_task_state_char (const struct task_struct *); extern bool kdb_task_state(const struct task_struct *p, const char *mask); extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); -extern void kdb_send_sig(struct task_struct *p, int sig); extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); -- cgit v1.2.3 From 3de4d22cc9ac7c9f38e10edcf54f9a8891a9c2aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 1 Jul 2023 17:14:47 +0000 Subject: bpf, btf: Warn but return no error for NULL btf from __register_btf_kfunc_id_set() __register_btf_kfunc_id_set() assumes .BTF to be part of the module's .ko file if CONFIG_DEBUG_INFO_BTF is enabled. If that's not the case, the function prints an error message and return an error. As a result, such modules cannot be loaded. However, the section could be stripped out during a build process. 
It would be better to let the modules load, because their basic functionality has no problem [0], though the BTF functionality will not be supported. Make the function lower the level of the message from error to warn, and return no error. [0] https://lore.kernel.org/bpf/20220219082037.ow2kbq5brktf4f2u@apollo.legion Fixes: c446fdacb10d ("bpf: fix register_btf_kfunc_id_set for !CONFIG_DEBUG_INFO_BTF") Reported-by: Alexander Egorenkov Suggested-by: Kumar Kartikeya Dwivedi Signed-off-by: SeongJae Park Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/87y228q66f.fsf@oc8242746057.ibm.com Link: https://lore.kernel.org/bpf/20220219082037.ow2kbq5brktf4f2u@apollo.legion Link: https://lore.kernel.org/bpf/20230701171447.56464-1-sj@kernel.org --- kernel/bpf/btf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 29fe21099298..817204d53372 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7891,10 +7891,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, pr_err("missing vmlinux BTF, cannot register kfuncs\n"); return -ENOENT; } - if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { - pr_err("missing module BTF, cannot register kfuncs\n"); - return -ENOENT; - } + if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + pr_warn("missing module BTF, cannot register kfuncs\n"); return 0; } if (IS_ERR(btf)) -- cgit v1.2.3 From f1962207150c8b602e980616f04b37ea4e64bb9f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Jul 2023 06:37:32 -0700 Subject: module: fix init_module_from_file() error handling Vegard Nossum pointed out two different problems with the error handling in init_module_from_file(): (a) the idempotent loading code didn't clean up properly in some error cases, leaving the on-stack 'struct idempotent' element still in the hash table (b) failure to read the module file would nonsensically update the 'invalid_kread_bytes' stat counter with the error value The first error is quite nasty, in that it can then cause subsequent idempotent loads of that same file to access stale stack contents of the previous failure. The case may not happen in any normal situation (explaining all the "Tested-by's on the original change), and requires admin privileges, but syzkaller triggers random bad behavior as a result: BUG: soft lockup in sys_finit_module BUG: unable to handle kernel paging request in init_module_from_file general protection fault in init_module_from_file INFO: task hung in init_module_from_file KASAN: out-of-bounds Read in init_module_from_file KASAN: slab-out-of-bounds Read in init_module_from_file ... The second error is fairly benign and just leads to nonsensical stats (and has been around since the debug stats were added). Vegard also provided a patch for the idempotent loading issue, but I'd rather re-organize the code and make it more legible using another level of helper functions than add the usual "goto out" error handling.
Link: https://lore.kernel.org/lkml/20230704100852.23452-1-vegard.nossum@oracle.com/ Fixes: 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent") Reported-by: Vegard Nossum Reported-by: Harshit Mogalapalli Reported-by: syzbot+9c2bdc9d24e4a7abe741@syzkaller.appspotmail.com Signed-off-by: Linus Torvalds --- kernel/module/main.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 834de86ebe35..59b1d067e528 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3092,7 +3092,7 @@ static bool idempotent(struct idempotent *u, const void *cookie) * remove everybody - which includes ourselves - fill in the return * value, and then complete the operation. */ -static void idempotent_complete(struct idempotent *u, int ret) +static int idempotent_complete(struct idempotent *u, int ret) { const void *cookie = u->cookie; int hash = hash_ptr(cookie, IDEM_HASH_BITS); @@ -3109,27 +3109,18 @@ static void idempotent_complete(struct idempotent *u, int ret) complete(&pos->complete); } spin_unlock(&idem_lock); + return ret; } static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { - struct idempotent idem; struct load_info info = { }; void *buf = NULL; - int len, ret; - - if (!f || !(f->f_mode & FMODE_READ)) - return -EBADF; - - if (idempotent(&idem, file_inode(f))) { - wait_for_completion(&idem.complete); - return idem.ret; - } + int len; len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); - mod_stat_add_long(len, &invalid_kread_bytes); return len; } @@ -3146,9 +3137,25 @@ static int init_module_from_file(struct file *f, const char __user * uargs, int info.len = len; } - ret = load_module(&info, uargs, flags); - idempotent_complete(&idem, ret); - return ret; + return load_module(&info, uargs, flags); +} + +static int idempotent_init_module(struct file *f, const char __user * uargs, int flags) +{ + struct idempotent idem; + + if (!f || !(f->f_mode & FMODE_READ)) + return -EBADF; + + /* See if somebody else is doing the operation? */ + if (idempotent(&idem, file_inode(f))) { + wait_for_completion(&idem.complete); + return idem.ret; + } + + /* Otherwise, we'll do it and complete others */ + return idempotent_complete(&idem, + init_module_from_file(f, uargs, flags)); } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) @@ -3168,7 +3175,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) return -EINVAL; f = fdget(fd); - err = init_module_from_file(f.file, uargs, flags); + err = idempotent_init_module(f.file, uargs, flags); fdput(f); return err; } -- cgit v1.2.3 From 02b0095e2fbbc060560c1065f86a211d91e27b26 Mon Sep 17 00:00:00 2001 From: Mateusz Stachyra Date: Tue, 4 Jul 2023 12:27:06 +0200 Subject: tracing: Fix null pointer dereference in tracing_err_log_open() Fix an issue in function 'tracing_err_log_open'. The function doesn't call 'seq_open' if the file is opened only with write permissions, which results in 'file->private_data' being left as null. If we then use 'lseek' on that opened file, 'seq_lseek' dereferences 'file->private_data' in 'mutex_lock(&m->lock)', resulting in a kernel panic. Writing to this node requires root privileges, therefore this bug has very little security impact. 
Tracefs node: /sys/kernel/tracing/error_log Example Kernel panic: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000038 Call trace: mutex_lock+0x30/0x110 seq_lseek+0x34/0xb8 __arm64_sys_lseek+0x6c/0xb8 invoke_syscall+0x58/0x13c el0_svc_common+0xc4/0x10c do_el0_svc+0x24/0x98 el0_svc+0x24/0x88 el0t_64_sync_handler+0x84/0xe4 el0t_64_sync+0x1b4/0x1b8 Code: d503201f aa0803e0 aa1f03e1 aa0103e9 (c8e97d02) ---[ end trace 561d1b49c12cf8a5 ]--- Kernel panic - not syncing: Oops: Fatal exception Link: https://lore.kernel.org/linux-trace-kernel/20230703155237eucms1p4dfb6a19caa14c79eb6c823d127b39024@eucms1p4 Link: https://lore.kernel.org/linux-trace-kernel/20230704102706eucms1p30d7ecdcc287f46ad67679fc8491b2e0f@eucms1p3 Cc: stable@vger.kernel.org Fixes: 8a062902be725 ("tracing: Add tracing error log") Signed-off-by: Mateusz Stachyra Suggested-by: Steven Rostedt Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4dde073ef..3d34e6fea6b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8135,7 +8135,7 @@ static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = tracing_lseek, .release = tracing_err_log_release, }; -- cgit v1.2.3 From fddca7db4a4c17f7333793dfb5308d80c76d2896 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 4 Jul 2023 10:08:07 -0400 Subject: tracing/boot: Test strscpy() against less than zero for error Instead of checking for -E2BIG, it is better to just check whether the return value of strscpy() is less than zero. Testing for -E2BIG is not very robust, and the calling code does not really care about the error code, just that there was an error. One of the updates to convert strlcpy() to strscpy() had a v2 version that changed the test from testing against -E2BIG to less than zero, but I took the v1 version that still tested for -E2BIG.
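Side by side, the two tests look like this (a sketch; the wrapper and buffer size are arbitrary, chosen only for illustration):

	static void report_too_long(const char *p)
	{
		char buf[32];

		if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG)	/* v1: matches one errno */
			pr_err("String is too long: %s\n", p);

		if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)	/* v2: any error */
			pr_err("String is too long: %s\n", p);
	}

Testing for any negative return keeps working even if strscpy() ever reports a different error code, and these callers only need to know that the copy failed.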
Link: https://lore.kernel.org/linux-trace-kernel/20230615180420.400769-1-azeemshaikh38@gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230704100807.707d1605@rorschach.local.home Cc: Mark Rutland Cc: Azeem Shaikh Cc: Kees Cook Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 5fe525f1b8cc..7ccc7a8e155b 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); -- cgit v1.2.3 From 5415ccd50a8620c8cbaa32d6f18c946c453566f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Jul 2023 20:17:29 +0530 Subject: bpf: Fix max stack depth check for async callbacks The check_max_stack_depth pass happens after the verifier's symbolic execution, and attempts to walk the call graph of the BPF program, ensuring that the stack usage stays within bounds for all possible call chains. There are two cases to consider: bpf_pseudo_func and bpf_pseudo_call. In the former case, the callback pointer is loaded into a register, and is assumed that it is passed to some helper later which calls it (however there is no way to be sure), but the check remains conservative and accounts the stack usage anyway. For this particular case, asynchronous callbacks are skipped as they execute asynchronously when their corresponding event fires. The case of bpf_pseudo_call is simpler and we know that the call is definitely made, hence the stack depth of the subprog is accounted for. However, the current check still skips an asynchronous callback even if a bpf_pseudo_call was made for it. This is erroneous, as it will miss accounting for the stack usage of the asynchronous callback, which can be used to breach the maximum stack depth limit. Fix this by only skipping asynchronous callbacks when the instruction is not a pseudo call to the subprog. 
Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230705144730.235802-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11e54dd8b6dd..930b5555cfd3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5642,8 +5642,9 @@ continue_func: verbose(env, "verifier bug. subprog has tail_call and async cb\n"); return -EFAULT; } - /* async callbacks don't increase bpf prog stack size */ - continue; + /* async callbacks don't increase bpf prog stack size unless called directly */ + if (!bpf_pseudo_call(insn + i)) + continue; } i = next_insn; -- cgit v1.2.3 From fb49c455323ff8319a123dd312be9082c49a23a5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 8 Jul 2023 12:12:12 -0700 Subject: fork: lock VMAs of the parent process when forking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When forking a child process, the parent write-protects anonymous pages and COW-shares them with the child being forked using copy_present_pte(). We must not take any concurrent page faults on the source VMAs while they are being processed, as we expect both the VMA and the PTEs behind it to be stable. For example, anon_vma_fork() expects the parent's vma->anon_vma not to change during the VMA copy. A concurrent page fault on a page newly marked read-only by the page copy might trigger wp_page_copy() and an anon_vma_prepare(vma) on the source VMA, defeating the anon_vma_clone() that wasn't done because the parent VMA originally didn't have an anon_vma, but we now might end up copying a PTE entry for a page that has one. Before the per-VMA lock based changes, the mmap_lock guaranteed exclusion with concurrent page faults. But now we need to do a vma_start_write() to make sure no concurrent faults happen on this VMA while it is being processed. This fix can potentially regress some fork-heavy workloads. Kernel build time did not show a noticeable regression on a 56-core machine, while a stress test mapping 10000 VMAs and forking 5000 times in a tight loop shows ~5% regression. If such a fork time regression is unacceptable, disabling CONFIG_PER_VMA_LOCK should restore its performance. Further optimizations are possible if this regression proves to be problematic.
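A minimal sketch of the locking rule the fix relies on (simplified; the real code lives in dup_mmap() and the CONFIG_PER_VMA_LOCK helpers, and error handling is elided):

        mmap_write_lock(oldmm);
        for_each_vma(vmi, vma) {
                /* Mark the VMA write-locked so lock_vma_under_rcu() in
                 * the page fault path bails out and falls back to
                 * mmap_lock, which we hold for the entire copy.
                 */
                vma_start_write(vma);
                /* ... copy PTEs, anon_vma_fork(), etc. ... */
        }
        mmap_write_unlock(oldmm); /* VMA write locks end with this mmap_lock cycle */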
Suggested-by: David Hildenbrand Reported-by: Jiri Slaby Closes: https://lore.kernel.org/all/dbdef34c-3a07-5951-e1ae-e9c6e3cdf51b@kernel.org/ Reported-by: Holger Hoffstätte Closes: https://lore.kernel.org/all/b198d649-f4bf-b971-31d0-e8433ec2a34c@applied-asynchrony.com/ Reported-by: Jacob Young Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217624 Fixes: 0bff0aaea03e ("x86/mm: try VMA lock-based page fault handling first") Cc: stable@vger.kernel.org Signed-off-by: Suren Baghdasaryan Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b85814e614a5..d2e12b6d2b18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -686,6 +686,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, for_each_vma(old_vmi, mpnt) { struct file *file; + vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; -- cgit v1.2.3 From ae2ad293d6be143ad223f5f947cca07bcbe42595 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 20 Jun 2023 16:07:47 +0800 Subject: sched/fair: Use recent_used_cpu to test p->cpus_ptr When checking whether a recently used CPU can be a potential idle candidate, recent_used_cpu should be used to test p->cpus_ptr as p->recent_used_cpu is not equal to recent_used_cpu and candidate decision is made based on recent_used_cpu here. Fixes: 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu") Signed-off-by: Miaohe Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20230620080747.359122-1-linmiaohe@huawei.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a80a73909dc2..b3e25be58e2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7174,7 +7174,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } -- cgit v1.2.3 From aff037078ecaecf34a7c2afab1341815f90fba5e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 29 Jun 2023 17:56:12 -0700 Subject: sched/psi: use kernfs polling functions for PSI trigger polling Destroying psi trigger in cgroup_file_release causes UAF issues when a cgroup is removed from under a polling process. This is happening because cgroup removal causes a call to cgroup_file_release while the actual file is still alive. Destroying the trigger at this point would also destroy its waitqueue head and if there is still a polling process on that file accessing the waitqueue, it will step on the freed pointer: do_select vfs_poll do_rmdir cgroup_rmdir kernfs_drain_open_files cgroup_file_release cgroup_pressure_release psi_trigger_destroy wake_up_pollfree(&t->event_wait) // vfs_poll is unblocked synchronize_rcu kfree(t) poll_freewait -> UAF access to the trigger's waitqueue head Patch [1] fixed this issue for epoll() case using wake_up_pollfree(), however the same issue exists for synchronous poll() case. 
The root cause of this issue is that the lifecycles of the psi trigger's waitqueue and of the file associated with the trigger are different. Fix this by using kernfs_generic_poll function when polling on cgroup-specific psi triggers. It internally uses kernfs_open_node->poll waitqueue head with its lifecycle tied to the file's lifecycle. This also renders the fix in [1] obsolete, so revert it. [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()") Fixes: 0e94682b73bf ("psi: introduce psi monitor") Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/ Reported-by: Lu Jialin Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230630005612.1014540-1-surenb@google.com --- kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bfe3cd8ccf36..f55a40db065f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res, of->file); + new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 81fca77397f6..9bb3f2b3ccfc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, continue; /* Generate an event */ - if (cmpxchg(&t->event, 0, 1) == 0) - wake_up_interruptible(&t->event_wait); + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } t->last_event_time = now; /* Reset threshold breach flag once event got generated */ t->pending_event = false; @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res, struct file *file) +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) { struct psi_trigger *t; enum psi_states state; @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; - init_waitqueue_head(&t->event_wait); + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); t->pending_event = false; t->aggregator = privileged ? PSI_POLL : PSI_AVGS; @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. 
*/ - wake_up_pollfree(&t->event_wait); + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); if (t->aggregator == PSI_AVGS) { mutex_lock(&group->avgs_lock); @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - poll_wait(file, &t->event_wait, wait); + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, res, file); + new = psi_trigger_create(&psi_system, buf, res, file, NULL); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit v1.2.3 From cf0a624dc706c306294c14e6b3e7694702f25191 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Mon, 3 Jul 2023 07:28:53 +0300 Subject: kernel/trace: Fix cleanup logic of enable_trace_eprobe The enable_trace_eprobe() function enables all event probes attached to a given trace probe. If an error occurs in enabling one of the event probes, all others should be rolled back. There is a bug in that rollback logic - instead of all event probes, only the failed one is disabled. Link: https://lore.kernel.org/all/20230703042853.1427493-1-tz.stoyanov@gmail.com/ Reported-by: Dan Carpenter Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Tzvetomir Stoyanov (VMware) Acked-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 67e854979d53..3f04f0ffe0d7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -675,6 +675,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, struct trace_eprobe *ep; bool enabled; int ret = 0; + int cnt = 0; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) @@ -698,12 +699,25 @@ static int enable_trace_eprobe(struct trace_event_call *call, if (ret) break; enabled = true; + cnt++; } if (ret) { /* Failed to enable one of them. Roll back all */ - if (enabled) - disable_eprobe(ep, file->tr); + if (enabled) { + /* + * It's a bug if one failed for something other than memory + * not being available but another eprobe succeeded. + */ + WARN_ON_ONCE(ret != -ENOMEM); + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + if (!--cnt) + break; + } + } if (file) trace_probe_remove_file(tp, file); else -- cgit v1.2.3 From 5f0c584daf7464f04114c65dd07269ee2bfedc13 Mon Sep 17 00:00:00 2001 From: Ze Gao Date: Mon, 3 Jul 2023 17:23:36 +0800 Subject: fprobe: add unlock to match a succeeded ftrace_test_recursion_trylock Unlock the ftrace recursion lock when fprobe_kprobe_handler() fails because of some running kprobe.
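The invariant being restored, as a hedged sketch (condensed from the handler; unrelated details elided): a successful ftrace_test_recursion_trylock() must be paired with an unlock on every exit path, so the early-exit case jumps to the unlock instead of returning.

        bit = ftrace_test_recursion_trylock(ip, parent_ip);
        if (bit < 0)
                return;                 /* lock not taken: bare return is fine */

        if (unlikely(kprobe_running())) {
                fp->nmissed++;
                goto recursion_unlock;  /* lock taken: must not leak it */
        }

        kprobe_busy_begin();
        __fprobe_handler(ip, parent_ip, ops, fregs);
        kprobe_busy_end();

recursion_unlock:
        ftrace_test_recursion_unlock(bit);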
Link: https://lore.kernel.org/all/20230703092336.268371-1-zegao@tencent.com/ Fixes: 3cc4e2c5fbae ("fprobe: make fprobe_kprobe_handler recursion free") Reported-by: Yafang Closes: https://lore.kernel.org/linux-trace-kernel/CALOAHbC6UpfFOOibdDiC7xFc5YFUgZnk3MZ=3Ny6we=AcrNbew@mail.gmail.com/ Signed-off-by: Ze Gao Acked-by: Masami Hiramatsu (Google) Acked-by: Yafang Shao Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 0121e8c0d54e..58e4b3607aef 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -102,12 +102,14 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, if (unlikely(kprobe_running())) { fp->nmissed++; - return; + goto recursion_unlock; } kprobe_busy_begin(); __fprobe_handler(ip, parent_ip, ops, fregs); kprobe_busy_end(); + +recursion_unlock: ftrace_test_recursion_unlock(bit); } -- cgit v1.2.3 From e1164787f22b51010cfdc6a5e41b744435836b79 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 5 Jul 2023 03:43:59 +0800 Subject: =?UTF-8?q?kprobes:=20Remove=20unnecessary=20=E2=80=98NULL?= =?UTF-8?q?=E2=80=99=20values=20from=20correct=5Fret=5Faddr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'correct_ret_addr' pointer is always set in the later code, so there is no need to initialize it at definition time. Link: https://lore.kernel.org/all/20230704194359.3124-1-zeming@nfschina.com/ Signed-off-by: Li zeming Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 00e177de91cc..fd713ab439c2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2007,9 +2007,9 @@ void __weak arch_kretprobe_fixup_return(struct pt_regs *regs, unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *frame_pointer) { - kprobe_opcode_t *correct_ret_addr = NULL; struct kretprobe_instance *ri = NULL; struct llist_node *first, *node = NULL; + kprobe_opcode_t *correct_ret_addr; struct kretprobe *rp; /* Find correct address and all nodes for this frame. */ -- cgit v1.2.3 From ed9492dfef8738ca68879f5690dda5a04f1897dc Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 12 Jul 2023 02:53:53 +0800 Subject: =?UTF-8?q?kernel:=20kprobes:=20Remove=20unnecessary=20=E2=80=980?= =?UTF-8?q?=E2=80=99=20values?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These variables are assigned before they are first used, so they do not need initial values at definition.
Link: https://lore.kernel.org/all/20230711185353.3218-1-zeming@nfschina.com/ Signed-off-by: Li zeming Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fd713ab439c2..a009c8ccd8ea 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1072,7 +1072,7 @@ static int kprobe_ftrace_enabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { - int ret = 0; + int ret; lockdep_assert_held(&kprobe_mutex); @@ -1110,7 +1110,7 @@ static int arm_kprobe_ftrace(struct kprobe *p) static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { - int ret = 0; + int ret; lockdep_assert_held(&kprobe_mutex); @@ -2692,7 +2692,7 @@ void kprobe_free_init_mem(void) static int __init init_kprobes(void) { - int i, err = 0; + int i, err; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ -- cgit v1.2.3 From 195b9cb5b288fec1c871ef89f78cc9a7461aad3a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 7 Jul 2023 23:03:19 +0900 Subject: fprobe: Ensure running fprobe_exit_handler() finished before calling rethook_free() Ensure running fprobe_exit_handler() has finished before calling rethook_free() in unregister_fprobe() so that the caller can free the fprobe right after unregister_fprobe(). unregister_fprobe() ensured that all running fprobe_entry/exit_handler() have finished by calling unregister_ftrace_function() which synchronizes RCU. But commit 5f81018753df ("fprobe: Release rethook after the ftrace_ops is unregistered") changed to call rethook_free() after unregister_ftrace_function(). So call rethook_stop() to disable the rethook before unregister_ftrace_function() and ensure it again. Here is the possible code flow that can call the exit handler after unregister_fprobe(). ------ CPU1 CPU2 call unregister_fprobe(fp) ... __fprobe_handler() rethook_hook() on probed function unregister_ftrace_function() return from probed function rethook hooks find rh->handler == fprobe_exit_handler call fprobe_exit_handler() rethook_free(): set rh->handler = NULL; return from unregister_fprobe; call fp->exit_handler() <- (*) ------ (*) At this point, the exit handler is called after returning from unregister_fprobe(). This fixes it as follows: ------ CPU1 CPU2 call unregister_fprobe() ...
rethook_stop(): set rh->handler = NULL; __fprobe_handler() rethook_hook() on probed function unregister_ftrace_function() return from probed function rethook hooks find rh->handler == NULL return from rethook rethook_free() return from unregister_fprobe; ------ Link: https://lore.kernel.org/all/168873859949.156157.13039240432299335849.stgit@devnote2/ Fixes: 5f81018753df ("fprobe: Release rethook after the ftrace_ops is unregistered") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 3 +++ kernel/trace/rethook.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 58e4b3607aef..2571f7f3d5f2 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -366,6 +366,9 @@ int unregister_fprobe(struct fprobe *fp) fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; + if (fp->rethook) + rethook_stop(fp->rethook); + ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 60f6cb2b486b..468006cce7ca 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -53,6 +53,19 @@ static void rethook_free_rcu(struct rcu_head *head) kfree(rh); } +/** + * rethook_stop() - Stop using a rethook. + * @rh: the struct rethook to stop. + * + * Stop using a rethook to prepare for freeing it. If you want to wait for + * all running rethook handler before calling rethook_free(), you need to + * call this first and wait RCU, and call rethook_free(). + */ +void rethook_stop(struct rethook *rh) +{ + WRITE_ONCE(rh->handler, NULL); +} + /** * rethook_free() - Free struct rethook. * @rh: the struct rethook to be freed. -- cgit v1.2.3 From d0a3022f30629a208e5944022caeca3568add9e7 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 29 Jun 2023 23:50:48 +0000 Subject: tracing/user_events: Fix struct arg size match check When users register an event, the name of the event and its arguments are checked to ensure they match if the event already exists. Normally all arguments are in the form of "type name", except for when the type starts with "struct ". In those cases, the size of the struct is passed in addition to the name, e.g. "struct my_struct a 20" for an argument that is of type "struct my_struct" with a field name of "a" and has the size of 20 bytes. The current code does not honor the above case properly when comparing a match. This causes the event register to fail even when the same string was used for events that contain a struct argument within them. The example above "struct my_struct a 20" generates a match string of "struct my_struct a" omitting the size field. Add the struct size of the existing field when generating a comparison string for a struct field to ensure proper match checking.
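For illustration (a hedged example; the struct name is made up): registering "struct my_struct a 20" twice should match itself, but the comparison string built from the existing event used to drop the size:

        before: "struct my_struct a"     /* size lost -> spurious mismatch */
        after:  "struct my_struct a 20"  /* size appended for struct types */

which is exactly what the added str_has_prefix(field->type, "struct ") branch restores.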
Link: https://lkml.kernel.org/r/20230629235049.581-2-beaub@linux.microsoft.com Cc: stable@vger.kernel.org Fixes: e6f89a149872 ("tracing/user_events: Ensure user provided strings are safely formatted") Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 4f5e74bbdab2..33cb6af31f39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1317,6 +1317,9 @@ static int user_field_set_string(struct ftrace_event_field *field, pos += snprintf(buf + pos, LEN_OR_ZERO, " "); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + if (str_has_prefix(field->type, "struct ")) + pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size); + if (colon) pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); -- cgit v1.2.3 From c9e4bf607d8c431452ef362c00e62ce999bbae93 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 11 Jul 2023 13:48:12 +0200 Subject: PM: hibernate: Fix writing maj:min to /sys/power/resume resume_store() first calls lookup_bdev() and afterwards tries to handle maj:min, but it does not reset the error first, hence if you write maj:min you get ENOENT: # echo 259:2 >| /sys/power/resume bash: echo: write error: No such file or directory This should also fix hibernation via systemd, since it uses this method. Fixes: 1e8c813b083c4 ("PM: hibernate: don't use early_lookup_bdev in resume_store") Signed-off-by: Azat Khuzhin Reviewed-by: Christoph Hellwig [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f62e89d0d906..e1b4bfa938dd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1179,6 +1179,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, unsigned maj, min, offset; char *p, dummy; + error = 0; if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { -- cgit v1.2.3 From 3a8395b565b5b4f019b3dc182be4c4541eb35ac8 Mon Sep 17 00:00:00 2001 From: Chungkai Yang Date: Wed, 5 Jul 2023 16:59:07 +0800 Subject: PM: QoS: Restore support for default value on frequency QoS Commit 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is non-negative") makes sure CPU freq is non-negative to avoid a negative value being converted to an unsigned data type. However, when the value is PM_QOS_DEFAULT_VALUE, pm_qos_update_target specifically uses c->default_value, which is set to FREQ_QOS_MIN/MAX_DEFAULT_VALUE when cpufreq_policy_alloc is executed, to handle this case. Adding a check for PM_QOS_DEFAULT_VALUE to let the default setting work fixes this problem. Fixes: 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is non-negative") Link: https://lore.kernel.org/lkml/20230626035144.19717-1-Chung-kai.Yang@mediatek.com/ Link: https://lore.kernel.org/lkml/20230627071727.16646-1-Chung-kai.Yang@mediatek.com/ Link: https://lore.kernel.org/lkml/CAJZ5v0gxNOWhC58PHeUhW_tgf6d1fGJVZ1x91zkDdht11yUv-A@mail.gmail.com/ Signed-off-by: Chungkai Yang Cc: 6.0+ # 6.0+ Signed-off-by: Rafael J.
Wysocki --- kernel/power/qos.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index af51ed6d45ef..782d3b41c1f3 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -426,6 +426,11 @@ late_initcall(cpu_latency_qos_init); /* Definitions related to the frequency QoS below. */ +static inline bool freq_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. @@ -531,7 +536,7 @@ int freq_qos_add_request(struct freq_constraints *qos, { int ret; - if (IS_ERR_OR_NULL(qos) || !req || value < 0) + if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), @@ -563,7 +568,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request); */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { - if (!req || new_value < 0) + if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), -- cgit v1.2.3 From 4369016497319a9635702da010d02af1ebb1849d Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 11 Jul 2023 19:58:48 +0800 Subject: bpf: cpumap: Fix memory leak in cpu_map_update_elem Syzkaller reported a memory leak as follows: BUG: memory leak unreferenced object 0xff110001198ef748 (size 192): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 32 bytes): 00 00 00 00 4a 19 00 00 80 ad e3 e4 fe ff c0 00 ....J........... 00 b2 d3 0c 01 00 11 ff 28 f5 8e 19 01 00 11 ff ........(....... backtrace: [] __cpu_map_entry_alloc+0xf7/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 BUG: memory leak unreferenced object 0xff110001198ef528 (size 192): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __cpu_map_entry_alloc+0x260/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 BUG: memory leak unreferenced object 0xff1100010fd93d68 (size 8): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: [] kvmalloc_node+0x11e/0x170 [] __cpu_map_entry_alloc+0x2f0/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 In the cpu_map_update_elem flow, when kthread_stop is called before calling the threadfn of rcpu->kthread, since the KTHREAD_SHOULD_STOP bit of the kthread has been set by kthread_stop, the threadfn of rcpu->kthread will never be executed, and rcpu->refcnt will never be 0, which means the allocated rcpu, rcpu->queue and rcpu->queue->queue can never be released. Calling kthread_stop before the kthread's threadfn has executed returns -EINTR. We can complete the release of memory resources in this state.
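The essence of the fix, restated as a hedged sketch (simplified from cpu_map_kthread_stop(); refcount internals elided):

        err = kthread_stop(rcpu->kthread);
        if (err) {
                /* kthread_stop() returns -EINTR when KTHREAD_SHOULD_STOP
                 * was set before the thread function ever ran. In that
                 * case cpu_map_kthread_run() never executes and never
                 * drops its references, so release them here instead.
                 */
                put_cpu_map_entry(rcpu);
        }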
Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP") Signed-off-by: Pu Lehui Acked-by: Jesper Dangaard Brouer Acked-by: Hou Tao Link: https://lore.kernel.org/r/20230711115848.2701559-1-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8a33e8747a0e..6ae02be7a48e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -122,22 +122,6 @@ static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) atomic_inc(&rcpu->refcnt); } -/* called from workqueue, to workaround syscall using preempt_disable */ -static void cpu_map_kthread_stop(struct work_struct *work) -{ - struct bpf_cpu_map_entry *rcpu; - - rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); - - /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, - * as it waits until all in-flight call_rcu() callbacks complete. - */ - rcu_barrier(); - - /* kthread_stop will wake_up_process and wait for it to complete */ - kthread_stop(rcpu->kthread); -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -165,6 +149,30 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + int err; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + err = kthread_stop(rcpu->kthread); + if (err) { + /* kthread_stop may be called before cpu_map_kthread_run + * is executed, so we need to release the memory related + * to rcpu. 
+ */ + put_cpu_map_entry(rcpu); + } +} + static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, struct list_head *listp, struct xdp_cpumap_stats *stats) -- cgit v1.2.3 From 7d8b31b73c79835572611ed1eed649e4d2e14245 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:51:48 +0200 Subject: tracing: arm64: Avoid missing-prototype warnings These are all tracing W=1 warnings in arm64 allmodconfig about missing prototypes: kernel/trace/trace_kprobe_selftest.c:7:5: error: no previous prototype for 'kprobe_trace_selftest_target' [-Werror=missing-pro totypes] kernel/trace/ftrace.c:329:5: error: no previous prototype for '__register_ftrace_function' [-Werror=missing-prototypes] kernel/trace/ftrace.c:372:5: error: no previous prototype for '__unregister_ftrace_function' [-Werror=missing-prototypes] kernel/trace/ftrace.c:4130:15: error: no previous prototype for 'arch_ftrace_match_adjust' [-Werror=missing-prototypes] kernel/trace/fgraph.c:243:15: error: no previous prototype for 'ftrace_return_to_handler' [-Werror=missing-prototypes] kernel/trace/fgraph.c:358:6: error: no previous prototype for 'ftrace_graph_sleep_time_control' [-Werror=missing-prototypes] arch/arm64/kernel/ftrace.c:460:6: error: no previous prototype for 'prepare_ftrace_return' [-Werror=missing-prototypes] arch/arm64/kernel/ptrace.c:2172:5: error: no previous prototype for 'syscall_trace_enter' [-Werror=missing-prototypes] arch/arm64/kernel/ptrace.c:2195:6: error: no previous prototype for 'syscall_trace_exit' [-Werror=missing-prototypes] Move the declarations to an appropriate header where they can be seen by the caller and callee, and make sure the headers are included where needed. Link: https://lore.kernel.org/linux-trace-kernel/20230517125215.930689-1-arnd@kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Will Deacon Cc: Kees Cook Cc: Florent Revest Signed-off-by: Arnd Bergmann Acked-by: Catalin Marinas [ Fixed ftrace_return_to_handler() to handle CONFIG_HAVE_FUNCTION_GRAPH_RETVAL case ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 1 + kernel/trace/ftrace_internal.h | 5 +++-- kernel/trace/trace_kprobe_selftest.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index cd2c35b1dd8f..c83c005e654e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -15,6 +15,7 @@ #include #include "ftrace_internal.h" +#include "trace.h" #ifdef CONFIG_DYNAMIC_FTRACE #define ASSIGN_OPS_HASH(opsname, val) \ diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 382775edf690..5012c04f92c0 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -2,6 +2,9 @@ #ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H #define _LINUX_KERNEL_FTRACE_INTERNAL_H +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); + #ifdef CONFIG_FUNCTION_TRACER extern struct mutex ftrace_lock; @@ -15,8 +18,6 @@ int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); #else /* !CONFIG_DYNAMIC_FTRACE */ -int __register_ftrace_function(struct ftrace_ops *ops); -int __unregister_ftrace_function(struct ftrace_ops *ops); /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c index 16548ee4c8c6..3851cd1e6a62 100644 --- a/kernel/trace/trace_kprobe_selftest.c +++ 
b/kernel/trace/trace_kprobe_selftest.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#include "trace_kprobe_selftest.h" + /* * Function used during the kprobe self test. This function is in a separate * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it -- cgit v1.2.3 From 7e42907f3a7b4ce3a2d1757f6d78336984daf8f5 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Sun, 9 Jul 2023 06:51:44 +0800 Subject: ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found that in tracing_read_pipe(), ring_buffer_empty_cpu() found a buffer that is not empty, but then nothing can be read from it because "rb_num_of_entries() == 0" is always true. The procedure then loops forever because the user buffer is never filled; see the following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer has not been filled, so goto 'waitagain', // which eventually leads to a deadloop in the kernel!!! } By some analysis, I found that when resetting the ring buffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when shrinking the ring buffer, if some of the removed pages contain dirty 'entries' data, that data is added into 'cpu_buffer->overrun' (see rb_remove_pages()), which causes a wrong 'overrun' count and eventually causes the deadloop issue. To fix it, we need to clear every page in rb_reset_cpu().
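A hedged sketch of the arithmetic that goes wrong (simplified; the real accounting lives in rb_remove_pages() and rb_num_of_entries()):

        /* shrink path, per removed page: */
        cpu_buffer->overrun += local_read(&page->entries); /* stale after reset! */

        /* reader side, roughly: */
        entries = local_read(&cpu_buffer->entries)
                  - (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
        /* an inflated 'overrun' drives 'entries' to 0, so the buffer
         * looks non-empty yet yields nothing, and the reader spins.
         */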
Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 834b361a4a66..14d8001140c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5242,28 +5242,34 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); +static void rb_clear_buffer_page(struct buffer_page *page) +{ + local_set(&page->write, 0); + local_set(&page->entries, 0); + rb_init_page(page->page); + page->read = 0; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + struct buffer_page *page; + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); - local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->entries, 0); - local_set(&cpu_buffer->head_page->page->commit, 0); - - cpu_buffer->head_page->read = 0; + rb_clear_buffer_page(cpu_buffer->head_page); + list_for_each_entry(page, cpu_buffer->pages, list) { + rb_clear_buffer_page(page); + } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); - cpu_buffer->reader_page->read = 0; + rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); -- cgit v1.2.3 From 26efd79c4624294e553aeaa3439c646729bad084 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 12 Jul 2023 14:04:52 +0800 Subject: ftrace: Fix possible warning on checking all pages used in ftrace_process_locs() As the comments in ftrace_process_locs() note, there may be NULL pointers in the mcount_loc section: > Some architecture linkers will pad between > the different mcount_loc sections of different > object files to satisfy alignments. > Skip any NULL pointers. After commit 20e5227e9f55 ("ftrace: allow NULL pointers in mcount_loc"), NULL pointers are accounted for when allocating ftrace pages but skipped before being added into ftrace pages; this may result in some pages not being used. Then after commit 706c81f87f84 ("ftrace: Remove extra helper functions"), a warning may occur at: WARN_ON(pg->next); To fix it, only warn in the case where no pointers were skipped but the pages were not used up, then free those unused pages after releasing ftrace_lock.
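Restated as a hedged sketch of the new invariant (names as in the patch):

        /* Pages were sized for 'count' records including any NULL
         * entries; each skipped NULL leaves a record slot unused, so
         * trailing pages may legitimately remain empty.
         */
        if (pg_unuse) {
                WARN_ON(!skipped);      /* unused pages without skips is a bug */
                ftrace_free_pages(pg_unuse);
        }

Note that the free runs after ftrace_lock is dropped, since releasing the pages does not need the lock.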
Link: https://lore.kernel.org/linux-trace-kernel/20230712060452.3175675-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 706c81f87f84 ("ftrace: Remove extra helper functions") Suggested-by: Steven Rostedt Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3740aca79fe7..05c0024815bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) return cnt; } +static void ftrace_free_pages(struct ftrace_page *pages) +{ + struct ftrace_page *pg = pages; + + while (pg) { + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + pages = pg->next; + kfree(pg); + pg = pages; + ftrace_number_of_groups--; + } +} + static struct ftrace_page * ftrace_allocate_pages(unsigned long num_to_init) { @@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - pg = start_pg; - while (pg) { - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - start_pg = pg->next; - kfree(pg); - pg = start_pg; - ftrace_number_of_groups--; - } + ftrace_free_pages(start_pg); pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; } @@ -6471,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg_unuse = NULL; struct ftrace_page *start_pg; struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long skipped = 0; unsigned long count; unsigned long *p; unsigned long addr; @@ -6536,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod, * object files to satisfy alignments. * Skip any NULL pointers. */ - if (!addr) + if (!addr) { + skipped++; continue; + } end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { @@ -6551,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod, rec->ip = addr; } - /* We should have used all pages */ - WARN_ON(pg->next); + if (pg->next) { + pg_unuse = pg->next; + pg->next = NULL; + } /* Assign the last page to ftrace_pages */ ftrace_pages = pg; @@ -6574,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod, out: mutex_unlock(&ftrace_lock); + /* We should have used all pages unless we skipped some */ + if (pg_unuse) { + WARN_ON(!skipped); + ftrace_free_pages(pg_unuse); + } return ret; } -- cgit v1.2.3 From bec3c25c247c4f88a33d79675a09e1644c9a3114 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jul 2023 10:52:35 -0400 Subject: tracing: Stop FORTIFY_SOURCE complaining about stack trace caller The stack_trace event is an event created by the tracing subsystem to store stack traces. It originally just contained a hard coded array of 8 words to hold the stack, and a "size" to know how many entries are there. 
This is exported to user space as: name: kernel_stack ID: 4 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int size; offset:8; size:4; signed:1; field:unsigned long caller[8]; offset:16; size:64; signed:0; print fmt: "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n",i (void *)REC->caller[0], (void *)REC->caller[1], (void *)REC->caller[2], (void *)REC->caller[3], (void *)REC->caller[4], (void *)REC->caller[5], (void *)REC->caller[6], (void *)REC->caller[7] Where the user space tracers could parse the stack. The library was updated for this specific event to only look at the size, and not the array. But some older users still look at the array (note, the older code still checks to make sure the array fits inside the event that it read. That is, if only 4 words were saved, the parser would not read the fifth word because it will see that it was outside of the event size). This event was changed a while ago to be more dynamic, and would save a full stack even if it was greater than 8 words. It does this by simply allocating more ring buffer to hold the extra words. Then it copies in the stack via: memcpy(&entry->caller, fstack->calls, size); As the entry is struct stack_entry, that is created by a macro to both create the structure and export this to user space, it still had the caller field of entry defined as: unsigned long caller[8]. When the stack is greater than 8, the FORTIFY_SOURCE code notices that the amount being copied is greater than the source array and complains about it. It has no idea that the source is pointing to the ring buffer with the required allocation. To hide this from the FORTIFY_SOURCE logic, pointer arithmetic is used: ptr = ring_buffer_event_data(event); entry = ptr; ptr += offsetof(typeof(*entry), caller); memcpy(ptr, fstack->calls, size); Link: https://lore.kernel.org/all/20230612160748.4082850-1-svens@linux.ibm.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230712105235.5fc441aa@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Reported-by: Sven Schnelle Tested-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4529e264cb86..20122eeccf97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3118,6 +3118,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3161,9 +3162,25 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, trace_ctx); if (!event) goto out; - entry = ring_buffer_event_data(event); + ptr = ring_buffer_event_data(event); + entry = ptr; + + /* + * For backward compatibility reasons, the entry->caller is an + * array of 8 slots to store the stack. This is also exported + * to user space. The amount allocated on the ring buffer actually + * holds enough for the stack specified by nr_entries. This will + * go into the location of entry->caller. Due to string fortifiers + * checking the size of the destination of memcpy() it triggers + * when it detects that size is greater than 8. 
To hide this from + the fortifiers, we use "ptr" and pointer arithmetic to assign caller. + * + * The below is really just: + * memcpy(&entry->caller, fstack->calls, size); + */ + ptr += offsetof(typeof(*entry), caller); + memcpy(ptr, fstack->calls, size); - memcpy(&entry->caller, fstack->calls, size); entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3 From 8cc32a9bbf2934d90762d9de0187adcb5ad46a11 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Jun 2023 11:19:26 -0700 Subject: kallsyms: strip LTO-only suffixes from promoted global functions Commit 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static functions") stripped all function/variable suffixes starting with '.' regardless of whether those suffixes are generated in LTO mode or not. In fact, as far as I know, in LTO mode, when a static function/variable is promoted to the global scope, a '.llvm.<...>' suffix is added. The existing mechanism breaks live patching for an LTO kernel even if no .llvm.<...> symbols are involved. For example, for the following kernel symbols: $ grep bpf_verifier_vlog /proc/kallsyms ffffffff81549f60 t bpf_verifier_vlog ffffffff8268b430 d bpf_verifier_vlog._entry ffffffff8282a958 d bpf_verifier_vlog._entry_ptr ffffffff82e12a1f d bpf_verifier_vlog.__already_done 'bpf_verifier_vlog' is a static function. '_entry', '_entry_ptr' and '__already_done' are static variables used inside 'bpf_verifier_vlog', so llvm promotes them to file-level static with prefix 'bpf_verifier_vlog.'. Note that the func-level to file-level static function promotion also happens without LTO. Given a symbol name 'bpf_verifier_vlog', with an LTO kernel the current mechanism will return 4 symbols to the live patch subsystem, which the current live patching subsystem cannot handle. With a non-LTO kernel, only one symbol is returned. In [1], we had a lengthy discussion; the suggestion is to separate two cases: (1). new symbols with suffix which are generated regardless of whether LTO is enabled or not, and (2). new symbols with suffix generated only when LTO is enabled. The cleanup_symbol_name() should only remove suffixes for case (2). Case (1) should not be changed so it can work uniformly with or without LTO. This patch removes the LTO-only suffix '.llvm.<...>' so live patching and tracing should work the same way as for a non-LTO kernel. The cleanup_symbol_name() in scripts/kallsyms.c is also changed to have the same filtering pattern so both kernel and kallsyms tool have the same expectation on the order of symbols. [1] https://lore.kernel.org/live-patching/20230615170048.2382735-1-song@kernel.org/T/#u Fixes: 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static functions") Reported-by: Song Liu Signed-off-by: Yonghong Song Reviewed-by: Zhen Lei Reviewed-by: Nick Desaulniers Acked-by: Song Liu Link: https://lore.kernel.org/r/20230628181926.4102448-1-yhs@fb.com Signed-off-by: Kees Cook --- kernel/kallsyms.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7982cc9d497c..016d997131d4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -174,11 +174,10 @@ static bool cleanup_symbol_name(char *s) * LLVM appends various suffixes for local functions and variables that * must be promoted to global scope as part of LTO. This can break * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C.
Suffixes only in LLVM LTO observed: * - foo.llvm.[0-9a-f]+ - * - foo.[0-9a-f]+ */ - res = strchr(s, '.'); + res = strstr(s, ".llvm."); if (res) { *res = '\0'; return true; -- cgit v1.2.3 From 6018b585e8c6fa7d85d4b38d9ce49a5b67be7078 Mon Sep 17 00:00:00 2001 From: Mohamed Khalfella Date: Wed, 12 Jul 2023 22:30:21 +0000 Subject: tracing/histograms: Add histograms to hist_vars if they have referenced variables Hist triggers can have referenced variables without having direct variables fields. This can be the case if referenced variables are added for trigger actions. In this case the newly added references will not have field variables. Not taking such referenced variables into consideration can result in a bug where it would be possible to remove hist trigger with variables being refenced. This will result in a bug that is easily reproducable like so $ cd /sys/kernel/tracing $ echo 'synthetic_sys_enter char[] comm; long id' >> synthetic_events $ echo 'hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger $ echo 'hist:keys=common_pid.execname,id.syscall:onmatch(raw_syscalls.sys_enter).synthetic_sys_enter($comm, id)' >> events/raw_syscalls/sys_enter/trigger $ echo '!hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger [ 100.263533] ================================================================== [ 100.264634] BUG: KASAN: slab-use-after-free in resolve_var_refs+0xc7/0x180 [ 100.265520] Read of size 8 at addr ffff88810375d0f0 by task bash/439 [ 100.266320] [ 100.266533] CPU: 2 PID: 439 Comm: bash Not tainted 6.5.0-rc1 #4 [ 100.267277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014 [ 100.268561] Call Trace: [ 100.268902] [ 100.269189] dump_stack_lvl+0x4c/0x70 [ 100.269680] print_report+0xc5/0x600 [ 100.270165] ? resolve_var_refs+0xc7/0x180 [ 100.270697] ? kasan_complete_mode_report_info+0x80/0x1f0 [ 100.271389] ? resolve_var_refs+0xc7/0x180 [ 100.271913] kasan_report+0xbd/0x100 [ 100.272380] ? resolve_var_refs+0xc7/0x180 [ 100.272920] __asan_load8+0x71/0xa0 [ 100.273377] resolve_var_refs+0xc7/0x180 [ 100.273888] event_hist_trigger+0x749/0x860 [ 100.274505] ? kasan_save_stack+0x2a/0x50 [ 100.275024] ? kasan_set_track+0x29/0x40 [ 100.275536] ? __pfx_event_hist_trigger+0x10/0x10 [ 100.276138] ? ksys_write+0xd1/0x170 [ 100.276607] ? do_syscall_64+0x3c/0x90 [ 100.277099] ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 100.277771] ? destroy_hist_data+0x446/0x470 [ 100.278324] ? event_hist_trigger_parse+0xa6c/0x3860 [ 100.278962] ? __pfx_event_hist_trigger_parse+0x10/0x10 [ 100.279627] ? __kasan_check_write+0x18/0x20 [ 100.280177] ? mutex_unlock+0x85/0xd0 [ 100.280660] ? __pfx_mutex_unlock+0x10/0x10 [ 100.281200] ? kfree+0x7b/0x120 [ 100.281619] ? ____kasan_slab_free+0x15d/0x1d0 [ 100.282197] ? event_trigger_write+0xac/0x100 [ 100.282764] ? __kasan_slab_free+0x16/0x20 [ 100.283293] ? __kmem_cache_free+0x153/0x2f0 [ 100.283844] ? sched_mm_cid_remote_clear+0xb1/0x250 [ 100.284550] ? __pfx_sched_mm_cid_remote_clear+0x10/0x10 [ 100.285221] ? event_trigger_write+0xbc/0x100 [ 100.285781] ? __kasan_check_read+0x15/0x20 [ 100.286321] ? __bitmap_weight+0x66/0xa0 [ 100.286833] ? _find_next_bit+0x46/0xe0 [ 100.287334] ? task_mm_cid_work+0x37f/0x450 [ 100.287872] event_triggers_call+0x84/0x150 [ 100.288408] trace_event_buffer_commit+0x339/0x430 [ 100.289073] ? 
ring_buffer_event_data+0x3f/0x60 [ 100.292189] trace_event_raw_event_sys_enter+0x8b/0xe0 [ 100.295434] syscall_trace_enter.constprop.0+0x18f/0x1b0 [ 100.298653] syscall_enter_from_user_mode+0x32/0x40 [ 100.301808] do_syscall_64+0x1a/0x90 [ 100.304748] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 100.307775] RIP: 0033:0x7f686c75c1cb [ 100.310617] Code: 73 01 c3 48 8b 0d 65 3c 10 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 21 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 3c 10 00 f7 d8 64 89 01 48 [ 100.317847] RSP: 002b:00007ffc60137a38 EFLAGS: 00000246 ORIG_RAX: 0000000000000021 [ 100.321200] RAX: ffffffffffffffda RBX: 000055f566469ea0 RCX: 00007f686c75c1cb [ 100.324631] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 000000000000000a [ 100.328104] RBP: 00007ffc60137ac0 R08: 00007f686c818460 R09: 000000000000000a [ 100.331509] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009 [ 100.334992] R13: 0000000000000007 R14: 000000000000000a R15: 0000000000000007 [ 100.338381] We hit the bug because when the second hist trigger was created, has_hist_vars() returned false because the hist trigger did not have variables. As a result, save_hist_vars() was not called to add the trigger to trace_array->hist_vars. Later on, when we attempted to remove the first histogram, find_any_var_ref() failed to detect it being used because it did not find the second trigger in the hist_vars list. With this change we wait until trigger actions are created so we can take into consideration whether the hist trigger has variable references. Also, now we check the return value of save_hist_vars() and fail trigger creation if save_hist_vars() fails. Link: https://lore.kernel.org/linux-trace-kernel/20230712223021.636335-1-mkhalfella@purestorage.com Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Signed-off-by: Mohamed Khalfella Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b97d3ad832f1..c8c61381eba4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6663,13 +6663,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (get_named_trigger_data(trigger_data)) goto enable; - if (has_hist_vars(hist_data)) - save_hist_vars(hist_data); - ret = create_actions(hist_data); if (ret) goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + if (save_hist_vars(hist_data)) + goto out_unreg; + } + ret = tracing_map_init(hist_data->map); if (ret) goto out_unreg; -- cgit v1.2.3 From d5a821896360cc8b93a15bd888fabc858c038dc0 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 13 Jul 2023 22:14:35 +0800 Subject: tracing: Fix memory leak of iter->temp when reading trace_pipe kmemleak reports: unreferenced object 0xffff88814d14e200 (size 256): comm "cat", pid 336, jiffies 4294871818 (age 779.490s) hex dump (first 32 bytes): 04 00 01 03 00 00 00 00 08 00 00 00 00 00 00 00 ................ 0c d8 c8 9b ff ff ff ff 04 5a ca 9b ff ff ff ff .........Z......
backtrace: [] __kmalloc+0x4f/0x140 [] trace_find_next_entry+0xbb/0x1d0 [] trace_print_lat_context+0xaf/0x4e0 [] print_trace_line+0x3e0/0x950 [] tracing_read_pipe+0x2d9/0x5a0 [] vfs_read+0x143/0x520 [] ksys_read+0xbd/0x160 [] do_syscall_64+0x3f/0x90 [] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 When reading file 'trace_pipe', 'iter->temp' is allocated or relocated in trace_find_next_entry() but not freed before 'trace_pipe' is closed. To fix it, free 'iter->temp' in tracing_release_pipe(). Link: https://lore.kernel.org/linux-trace-kernel/20230713141435.1133021-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: ff895103a84ab ("tracing: Save off entry when peeking at next entry") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20122eeccf97..be847d45d81c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6781,6 +6781,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) free_cpumask_var(iter->started); kfree(iter->fmt); + kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); -- cgit v1.2.3 From d5f28bb1ce04636b285726ee2a5afa3a514025f4 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 8 Jul 2023 01:38:03 +0900 Subject: fprobes: Add a comment why fprobe_kprobe_handler exits if kprobe is running Add a comment explaining why fprobe_kprobe_handler() exits if any other kprobe is running. Link: https://lore.kernel.org/all/168874788299.159442.2485957441413653858.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230706120916.3c6abf15@gandalf.local.home/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2571f7f3d5f2..59321d22f43e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -100,6 +100,12 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, return; } + /* + * This user handler is shared with other kprobes and is not expected to be + * called recursively. So if any other kprobe handler is running, this will + * exit as kprobe does. See the section 'Share the callbacks with kprobes' + * in Documentation/trace/fprobe.rst for more information. + */ if (unlikely(kprobe_running())) { fp->nmissed++; goto recursion_unlock; } -- cgit v1.2.3 From 66bcf65d6cf0ca6540e2341e88ee7ef02dbdda08 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:29 +0900 Subject: tracing/probes: Fix to avoid double count of the string length on the array If an array is specified with ustring or symstr, the length of the strings is accumulated in both 'ret' and 'total', which means the length is double counted. Just set the length in 'ret' instead of adding it, to avoid double counting.
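A worked example of the double counting (hedged arithmetic, assuming an array of two 5-byte user strings): each iteration should compute ret = 5 and accumulate total = 5 + 5 = 10. With the erroneous '+=', 'ret' is never reset, so the second iteration computes ret = 5 + 5 = 10 and total becomes 5 + 10 = 15, half again the real length. The fix makes each case a plain assignment:

        case FETCH_OP_ST_USTRING:
                ret = fetch_store_strlen_user(val + code->offset); /* '=', not '+=' */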
Link: https://lore.kernel.org/all/168908492917.123124.15076463491122036025.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 00707630788d..4735c5cb76fa 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -156,11 +156,11 @@ stage3: code++; goto array; case FETCH_OP_ST_USTRING: - ret += fetch_store_strlen_user(val + code->offset); + ret = fetch_store_strlen_user(val + code->offset); code++; goto array; case FETCH_OP_ST_SYMSTR: - ret += fetch_store_symstrlen(val + code->offset); + ret = fetch_store_symstrlen(val + code->offset); code++; goto array; default: -- cgit v1.2.3 From b41326b5e0f82e93592c4366359917b5d67b529f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:38 +0900 Subject: tracing/probes: Fix not to count error code to total length Fix to not count the error code (which is a negative value) toward the total used length of the array, because it can mess up the return code of process_fetch_insn_bottom(). Also clear the 'ret' value, because it will be used for calculating the next data_loc entry. Link: https://lore.kernel.org/all/168908493827.123124.2175257289106364229.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 9b960a38835f ("tracing: probeevent: Unify fetch_insn processing common part") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4735c5cb76fa..ed9d57c6b041 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -204,6 +204,8 @@ stage3: array: /* the last stage: Loop on array */ if (code->op == FETCH_OP_LP_ARRAY) { + if (ret < 0) + ret = 0; total += ret; if (++i < code->param) { code = s3; -- cgit v1.2.3 From e38e2c6a9efc435f9de344b7c91f7697e01b47d5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:48 +0900 Subject: tracing/probes: Fix to update dynamic data counter if fetcharg uses it Fix to update the dynamic data counter ('dyndata') and max length ('maxlen') only if the fetcharg uses the dynamic data. Also move arg->dynamic out of unlikely(). Previously, the dynamic data address could become wrong if process_fetch_insn() returned an error in the !arg->dynamic case. A sketch of the data_loc encoding this bookkeeping maintains follows below.
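For readers following the 'ret'/data_loc bookkeeping in these patches, here is a simplified sketch of the encoding involved (derived from the make_data_loc()/get_loc_data() helpers in kernel/trace/trace_probe.h; a simplification, not the verbatim kernel code): the u32 data_loc word packs the length of the dynamic data into its upper 16 bits and the offset of that data from the event entry into its lower 16 bits, which is why a failed fetch must still record a sane (0-length) value instead of leaving a stale one.

	/* Simplified sketch of the data_loc encoding (cf. trace_probe.h).
	 * Recording a 0-length data_loc on failure keeps the offset field
	 * valid while telling consumers that no string data was stored.
	 */
	typedef unsigned int u32;

	static inline u32 make_data_loc(int len, int offs)
	{
		return ((u32)len << 16) | (offs & 0xffff);
	}

	static inline void *get_loc_data(u32 *dl, void *ent)
	{
		return ent + (*dl & 0xffff);	/* entry base + data offset */
	}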
Link: https://lore.kernel.org/all/168908494781.123124.8160245359962103684.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230710233400.5aaf024e@gandalf.local.home/ Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index ed9d57c6b041..185da001f4c3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,11 +267,13 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; + if (arg->dynamic) { + if (unlikely(ret < 0)) { + *dl = make_data_loc(0, dyndata - base); + } else { + dyndata += ret; + maxlen -= ret; + } } } } -- cgit v1.2.3 From 4ed8f337dee32df71435689c19d22e4ee846e15a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:57 +0900 Subject: Revert "tracing: Add "(fault)" name injection to kernel probes" This reverts commit 2e9906f84fc7c99388bb7123ade167250d50f1c0. It turned out that commit 2e9906f84fc7 ("tracing: Add "(fault)" name injection to kernel probes") did not work correctly and probe events still show just '(fault)' (instead of '"(fault)"'). Also, the current '(fault)' output is more explicit about the fact that the access faulted. This also moves the FAULT_STRING macro to trace.h so that synthetic events can keep using it, and uses it in trace_probe.c too. Link: https://lore.kernel.org/all/168908495772.123124.1250788051922100079.stgit@devnote2/ Link: https://lore.kernel.org/all/20230706230642.3793a593@rorschach.local.home/ Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_probe.c | 2 +- kernel/trace/trace_probe_kernel.h | 31 ++++++------------------------- 3 files changed, 9 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..eee1f3ca4749 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,8 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...)
\ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define FAULT_STRING "(fault)" + #define HIST_STACKTRACE_DEPTH 16 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..591399ddcee5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -65,7 +65,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) int len = *(u32 *)data >> 16; if (!len) - trace_seq_puts(s, "(fault)"); + trace_seq_puts(s, FAULT_STRING); else trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index c4e1d4c03a85..6deae2ce34f8 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -2,8 +2,6 @@ #ifndef __TRACE_PROBE_KERNEL_H_ #define __TRACE_PROBE_KERNEL_H_ -#define FAULT_STRING "(fault)" - /* * This depends on trace_probe.h, but can not include it due to * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. @@ -15,16 +13,8 @@ static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - int ret; - ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); - /* - * strnlen_user_nofault returns zero on fault, insert the - * FAULT_STRING when that occurs. - */ - if (ret <= 0) - return strlen(FAULT_STRING) + 1; - return ret; + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* Return the length of string -- including null terminal byte */ @@ -44,18 +34,7 @@ fetch_store_strlen(unsigned long addr) len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - /* For faults, return enough to hold the FAULT_STRING */ - return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; -} - -static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) -{ - if (ret >= 0) { - *(u32 *)dest = make_data_loc(ret, __dest - base); - } else { - strscpy(__dest, FAULT_STRING, len); - ret = strlen(__dest) + 1; - } + return (ret < 0) ? ret : len; } /* @@ -76,7 +55,8 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } @@ -107,7 +87,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } -- cgit v1.2.3 From 797311bce5c2ac90b8d65e357603cfd410d36ebb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:16:07 +0900 Subject: tracing/probes: Fix to record 0-length data_loc in fetch_store_string*() if fails Fix to record a 0-length data_loc in fetch_store_string*() if it fails to get the string data. Currently those functions expect that the data_loc is updated by store_trace_args() when they return an error code. However, that does not work correctly if the argument is an array of strings. In that case, store_trace_args() only clears the first entry of the array (which may have no error) and leaves the other entries untouched. So the data_loc should be cleared by fetch_store_string*() itself.
Also, 'dyndata' and 'maxlen' in store_trace_args() should be updated only if they are used (i.e. ret > 0 and the argument is dynamic data). Link: https://lore.kernel.org/all/168908496683.123124.4761206188794205601.stgit@devnote2/ Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe_kernel.h | 13 +++++++++---- kernel/trace/trace_probe_tmpl.h | 10 +++------- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 6deae2ce34f8..bb723eefd7b7 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -37,6 +37,13 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base) +{ + if (ret < 0) + ret = 0; + *(u32 *)dest = make_data_loc(ret, __dest - base); +} + /* * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf * with max length and relative data location. @@ -55,8 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } @@ -87,8 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 185da001f4c3..3935b347f874 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,13 +267,9 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (arg->dynamic) { - if (unlikely(ret < 0)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; - } + if (arg->dynamic && likely(ret > 0)) { + dyndata += ret; + maxlen -= ret; } } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..7b47e9a2c010 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -170,7 +170,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) */ ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); - } + } else + *(u32 *)dest = make_data_loc(0, (void *)dst - base); return ret; } -- cgit v1.2.3 From f7853c34241807bb97673a5e97719123be39a09e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Jul 2023 16:19:09 +0200 Subject: locking/rtmutex: Fix task->pi_waiters integrity Henry reported that rt_mutex_adjust_prio_check() has an ordering problem and puts the lie to the comment in [7]. Sharing the sort key between lock->waiters and owner->pi_waiters *does* create problems, since unlike what the comment claims, holding [L] is insufficient. Notably, consider: A / \ M1 M2 | | B C That is, task A owns both M1 and M2, B and C block on them. In this case a concurrent chain walk (B & C) will modify their resp. sort keys in [7] while holding M1->wait_lock and M2->wait_lock.
So holding [L] is meaningless, they're different Ls. This then gives rise to a race condition between [7] and [11], where the requeue of pi_waiters will observe an inconsistent tree order. B C (holds M1->wait_lock, (holds M2->wait_lock, holds B->pi_lock) holds A->pi_lock) [7] waiter_update_prio(); ... [8] raw_spin_unlock(B->pi_lock); ... [10] raw_spin_lock(A->pi_lock); [11] rt_mutex_enqueue_pi(); // observes inconsistent A->pi_waiters // tree order Fixing this means either extending the range of the owner lock from [10-13] to [6-13], with the immediate problem that this means [6-8] hold both blocked and owner locks, or duplicating the sort key. Since the locking in chain walk is horrible enough without having to consider pi_lock nesting rules, duplicate the sort key instead. By giving each tree their own sort key, the above race becomes harmless, if C sees B at the old location, then B will correct things (if they need correcting) when it walks up the chain and reaches A. Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree") Reported-by: Henry Wu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Tested-by: Henry Wu Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 170 +++++++++++++++++++++++++++------------- kernel/locking/rtmutex_api.c | 2 +- kernel/locking/rtmutex_common.h | 47 ++++++++--- kernel/locking/ww_mutex.h | 12 +-- 4 files changed, 155 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 728f434de2bb..21db0df0eb00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task) return prio; } +/* + * Update the waiter->tree copy of the sort keys. + */ static __always_inline void waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) { - waiter->prio = __waiter_prio(task); - waiter->deadline = task->dl.deadline; + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). 
+ */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; } /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { - if (rt_mutex_waiter_less(waiter, top_waiter)) + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) return true; #ifdef RT_MUTEX_BUILD_SPINLOCKS @@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. 
*/ - if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) return false; - return rt_mutex_waiter_equal(waiter, top_waiter); + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); #else return false; #endif } #define __node_2_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, tree_entry) + rb_entry((node), struct rt_mutex_waiter, tree.entry) static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { struct rt_mutex_waiter *aw = __node_2_waiter(a); struct rt_mutex_waiter *bw = __node_2_waiter(b); - if (rt_mutex_waiter_less(aw, bw)) + if (rt_waiter_node_less(&aw->tree, &bw->tree)) return 1; if (!build_ww_mutex()) return 0; - if (rt_mutex_waiter_less(bw, aw)) + if (rt_waiter_node_less(&bw->tree, &aw->tree)) return 0; /* NOTE: relies on waiter->ww_ctx being set before insertion */ @@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod static __always_inline void rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); } static __always_inline void rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - rb_erase_cached(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); } -#define __node_2_pi_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) -static __always_inline bool -__pi_waiter_less(struct rb_node *a, const struct rb_node *b) +static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); + lockdep_assert_held(&task->pi_lock); + + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + lockdep_assert_held(&task->pi_lock); + + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) return; - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * 
[Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, @@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. 
*/ waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. + */ task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); @@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* @@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, waiter->task = task; waiter->lock = lock; waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, * task unblocks. 
*/ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index cb9fdff76a8a..a6974d044593 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c47e8361bfb5..1162e07cdaea 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -17,27 +17,44 @@ #include #include + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. 
* - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) - * @prio: Priority of the waiter - * @deadline: Deadline of the waiter if applicable * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; struct rt_mutex_base *lock; unsigned int wake_state; - int prio; - u64 deadline; struct ww_acquire_ctx *ww_ctx; }; @@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, { struct rb_node *leftmost = rb_first_cached(&lock->waiters); - return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) @@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base * struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; + lockdep_assert_held(&lock->wait_lock); + if (leftmost) { - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); BUG_ON(w->lock != lock); } return w; @@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p) static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { + lockdep_assert_held(&p->pi_lock); + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + pi_tree.entry); } #define RT_MUTEX_HAS_WAITERS 1UL @@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); waiter->wake_state = TASK_NORMAL; waiter->task = NULL; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 56f139201f24..3ad2cc4823e5 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock) struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_next(&w->tree_entry); + struct rb_node *n = rb_next(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_prev(&w->tree_entry); + struct rb_node *n = rb_prev(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct 
rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * @@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock) struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline void -- cgit v1.2.3 From 636e348353a7cc52609fdba5ff3270065da140d5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 9 Jul 2023 01:33:44 +0200 Subject: prctl: move PR_GET_AUXV out of PR_MCE_KILL Somehow PR_GET_AUXV got added into PR_MCE_KILL's switch when the patch was applied [1]. Thus move it out of the switch, to the place the patch added it. In the recently released v6.4 kernel some user could, in principle, be already using this feature by mapping the right page and passing the PR_GET_AUXV constant as a pointer: prctl(PR_MCE_KILL, PR_GET_AUXV, ...) So this does change the behavior for users. We could keep the bug since the other subcases in PR_MCE_KILL (PR_MCE_KILL_CLEAR and PR_MCE_KILL_SET) do not overlap. However, v6.4 may be recent enough (2 weeks old) that moving the lines (rather than just adding a new case) does not break anybody? Moreover, the documentation in man-pages was just committed today [2]. Link: https://lkml.kernel.org/r/20230708233344.361854-1-ojeda@kernel.org Fixes: ddc65971bb67 ("prctl: add PR_GET_AUXV to copy auxv to userspace") Link: https://lore.kernel.org/all/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org/ [1] Link: https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/commit/?id=8cf0c06bfd3c2b219b044d4151c96f0da50af9ad [2] Signed-off-by: Miguel Ojeda Cc: Josh Triplett Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 05f838929e72..2410e3999ebe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2535,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_AUXV: - if (arg4 || arg5) - return -EINVAL; - error = prctl_get_auxv((void __user *)arg2, arg3); - break; default: return -EINVAL; } @@ -2694,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) -- cgit v1.2.3 From ba7b3e7d5f9014be65879ede8fd599cb222901c9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 17 Jul 2023 21:45:28 +0530 Subject: bpf: Fix subprog idx logic in check_max_stack_depth The assignment to idx in check_max_stack_depth happens once we see a bpf_pseudo_call or bpf_pseudo_func. This is not an issue as the rest of the code performs a few checks and then pushes the frame to the frame stack, except the case of async callbacks. If the async callback case causes the loop iteration to be skipped, the idx assignment will be incorrect on the next iteration of the loop. The value stored in the frame stack (as the subprogno of the current subprog) will be incorrect. This leads to incorrect checks and incorrect tail_call_reachable marking. 
Save the target subprog in a new variable and only assign to idx once we are done with the is_async_cb check which may skip pushing of frame to the frame stack and subsequent stack depth checks and tail call markings. Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230717161530.1238-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 930b5555cfd3..e682056dd144 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5621,7 +5621,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - int next_insn; + int next_insn, sidx; if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; @@ -5631,14 +5631,14 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; - idx = find_subprog(env, next_insn); - if (idx < 0) { + sidx = find_subprog(env, next_insn); + if (sidx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", next_insn); return -EFAULT; } - if (subprog[idx].is_async_cb) { - if (subprog[idx].has_tail_call) { + if (subprog[sidx].is_async_cb) { + if (subprog[sidx].has_tail_call) { verbose(env, "verifier bug. subprog has tail_call and async cb\n"); return -EFAULT; } @@ -5647,6 +5647,7 @@ continue_func: continue; } i = next_insn; + idx = sidx; if (subprog[idx].has_tail_call) tail_call_reachable = true; -- cgit v1.2.3 From b5e9ad522c4ccd32d322877515cff8d47ed731b9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 17 Jul 2023 21:45:29 +0530 Subject: bpf: Repeat check_max_stack_depth for async callbacks While the check_max_stack_depth function explores call chains emanating from the main prog, which is typically enough to cover all possible call chains, it doesn't explore those rooted at async callbacks unless the async callback will have been directly called, since unlike non-async callbacks it skips their instruction exploration as they don't contribute to stack depth. It could be the case that the async callback leads to a callchain which exceeds the stack depth, but this is never reachable while only exploring the entry point from main subprog. Hence, repeat the check for the main subprog *and* all async callbacks marked by the symbolic execution pass of the verifier, as execution of the program may begin at any of them. Consider functions with following stack depths: main: 256 async: 256 foo: 256 main: rX = async bpf_timer_set_callback(...) async: foo() Here, async is not descended as it does not contribute to stack depth of main (since it is referenced using bpf_pseudo_func and not bpf_pseudo_call). However, when async is invoked asynchronously, it will end up breaching the MAX_BPF_STACK limit by calling foo. Hence, in addition to main, we also need to explore call chains beginning at all async callback subprogs in a program. 
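To make the main/async/foo shape above concrete, here is a hedged sketch of such a program (assuming libbpf, vmlinux.h, and CO-RE tooling; the section name, map layout, and buffer sizes are illustrative and not taken from the patch):

	// Illustrative sketch: main_prog alone uses little stack, but the
	// timer callback chain async_cb -> foo needs ~600 bytes, past the
	// 512-byte MAX_BPF_STACK limit, so only the repeated per-callback
	// check described above catches it.
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	struct elem { struct bpf_timer t; };

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct elem);
	} timer_map SEC(".maps");

	static __noinline int foo(void)
	{
		volatile char buf[300] = {};	/* ~300 bytes of stack */

		return buf[0];
	}

	static int async_cb(void *map, int *key, struct bpf_timer *timer)
	{
		volatile char buf[300] = {};	/* ~300 more on the async chain */

		return foo() + buf[0];
	}

	SEC("tracepoint/syscalls/sys_enter_nanosleep")
	int main_prog(void *ctx)
	{
		int k = 0;
		struct elem *e = bpf_map_lookup_elem(&timer_map, &k);

		if (!e)
			return 0;
		bpf_timer_init(&e->t, &timer_map, 1 /* CLOCK_MONOTONIC */);
		bpf_timer_set_callback(&e->t, async_cb);
		bpf_timer_start(&e->t, 0, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";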
Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230717161530.1238-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e682056dd144..02a021c524ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5573,16 +5573,17 @@ static int update_stack_depth(struct bpf_verifier_env *env, * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth(struct bpf_verifier_env *env) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) { - int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + int depth = 0, frame = 0, i, subprog_end; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; + i = subprog[idx].start; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -5683,6 +5684,22 @@ continue_func: goto continue_func; } +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *si = env->subprog_info; + int ret; + + for (int i = 0; i < env->subprog_cnt; i++) { + if (!i || si[i].is_async_cb) { + ret = check_max_stack_depth_subprog(env, i); + if (ret < 0) + return ret; + } + continue; + } + return 0; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON static int get_callee_stack_depth(struct bpf_verifier_env *env, const struct bpf_insn *insn, int idx) -- cgit v1.2.3 From 1faf7e4a0b6ff13d50c45b4b3469fc125536cf37 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 22 Jul 2023 11:21:23 +0800 Subject: tracing: Remove unused extern declaration tracing_map_set_field_descr() Since commit 08d43a5fa063 ("tracing: Add lock-free tracing_map"), this is never used, so can be removed. Link: https://lore.kernel.org/linux-trace-kernel/20230722032123.24664-1-yuehaibing@huawei.com Cc: Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 2c765ee2a4d4..99c37eeebc16 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); -extern void tracing_map_set_field_descr(struct tracing_map *map, - unsigned int i, - unsigned int key_offset, - tracing_map_cmp_fn_t cmp_fn); extern int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_key *sort_keys, -- cgit v1.2.3 From 8a96c0288d0737ad77882024974c075345c72011 Mon Sep 17 00:00:00 2001 From: Chen Lin Date: Wed, 19 Jul 2023 15:58:47 +0800 Subject: ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. 
This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: 
ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. 
Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 14 +++++++++++++- kernel/trace/trace.c | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 14d8001140c8..de061dd47313 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -536,6 +536,7 @@ struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -2167,7 +2168,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - + atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* @@ -2322,6 +2323,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, atomic_dec(&buffer->record_disabled); } + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; @@ -2342,6 +2344,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } out_err_unlock: + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } @@ -5541,6 +5544,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (local_read(&cpu_buffer_b->committing)) goto out_dec; + /* + * When resize is in progress, we cannot swap it because + * it will mess the state of the cpu buffer. + */ + if (atomic_read(&buffer_a->resizing)) + goto out_dec; + if (atomic_read(&buffer_b->resizing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be847d45d81c..b8870078ef58 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1928,9 +1928,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. + * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit in progress\n"); + "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); -- cgit v1.2.3 From 4b8b3905165ef98386a3c06f196c85d21292d029 Mon Sep 17 00:00:00 2001 From: Mohamed Khalfella Date: Fri, 14 Jul 2023 20:33:41 +0000 Subject: tracing/histograms: Return an error if we fail to add histogram to hist_vars list Commit 6018b585e8c6 ("tracing/histograms: Add histograms to hist_vars if they have referenced variables") added a check to fail histogram creation if save_hist_vars() failed to add the histogram to the hist_vars list. But that commit failed to set 'ret' to the error return code before jumping to unregister the histogram; fix that.
Link: https://lore.kernel.org/linux-trace-kernel/20230714203341.51396-1-mkhalfella@purestorage.com Cc: stable@vger.kernel.org Fixes: 6018b585e8c6 ("tracing/histograms: Add histograms to hist_vars if they have referenced variables") Signed-off-by: Mohamed Khalfella Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c8c61381eba4..d06938ae0717 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6668,7 +6668,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_unreg; if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - if (save_hist_vars(hist_data)) + ret = save_hist_vars(hist_data); + if (ret) goto out_unreg; } -- cgit v1.2.3 From aa6fde93f3a49e42c0fe0490d7f3711bac0d162e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 17 Jul 2023 12:50:02 -1000 Subject: workqueue: Scale up wq_cpu_intensive_thresh_us if BogoMIPS is below 4000 wq_cpu_intensive_thresh_us is used to detect CPU-hogging per-cpu work items. Once detected, they're excluded from concurrency management to prevent them from blocking other per-cpu work items. If CONFIG_WQ_CPU_INTENSIVE_REPORT is enabled, repeat offenders are also reported so that the code can be updated. The default threshold is 10ms, which is long enough to do a fair bit of work on modern CPUs while short enough to be usually not noticeable. This unfortunately leads to a lot of arguably spurious detections on very slow CPUs. Using the same threshold across CPUs whose performance levels may be orders of magnitude apart doesn't make a whole lot of sense. This patch scales up wq_cpu_intensive_thresh_us up to 1 second when BogoMIPS is below 4000. This is obviously very inaccurate, but it doesn't have to be accurate to be useful. The mechanism is still useful when the threshold is fully scaled up, and the benefits of the reports are usually shared with everyone regardless of who's reporting, so as long as a sufficient number of fast machines are reporting, we don't lose much. Some (or is it all?) ARM CPUs systematically report significantly lower BogoMIPS. While this doesn't break anything, given how widespread ARM CPUs are, it's at least a missed opportunity and it probably would be a good idea to teach workqueue about it. Signed-off-by: Tejun Heo Reported-and-Tested-by: Geert Uytterhoeven --- kernel/workqueue.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..800b4208dba9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask; * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init().
*/ -static unsigned long wq_cpu_intensive_thresh_us = 10000; +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); static bool wq_disable_numa; @@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold upto 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. + */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * @@ -6528,6 +6567,8 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; + wq_cpu_intensive_thresh_init(); + /* * It'd be simpler to initialize NUMA in workqueue_init_early() but * CPU to node mapping may not be available that early on some -- cgit v1.2.3 From f2c67a3e60d1071b65848efaa8c3b66c363dd025 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Jul 2023 10:42:05 +0200 Subject: bpf: Disable preemption in bpf_perf_event_output The nesting protection in bpf_perf_event_output relies on disabled preemption, which is guaranteed for kprobes and tracepoints. However, bpf_perf_event_output can also be called from uprobe context through the bpf_prog_run_array_sleepable function, which disables migration but keeps preemption enabled. This can cause the task to be preempted by another one inside the nesting protection, eventually leading to two tasks using the same perf_sample_data buffer and causing crashes like: kernel tried to execute NX-protected page - exploit attempt? (uid: 0) BUG: unable to handle page fault for address: ffffffff82be3eea ... Call Trace: ? __die+0x1f/0x70 ? page_fault_oops+0x176/0x4d0 ? exc_page_fault+0x132/0x230 ? asm_exc_page_fault+0x22/0x30 ? perf_output_sample+0x12b/0x910 ? perf_event_output+0xd0/0x1d0 ? bpf_perf_event_output+0x162/0x1d0 ? bpf_prog_c6271286d9a4c938_krava1+0x76/0x87 ? __uprobe_perf_func+0x12b/0x540 ? uprobe_dispatcher+0x2c4/0x430 ? uprobe_notify_resume+0x2da/0xce0 ? atomic_notifier_call_chain+0x7b/0x110 ? exit_to_user_mode_prepare+0x13e/0x290 ? irqentry_exit_to_user_mode+0x5/0x30 ? asm_exc_int3+0x35/0x40 Fix this by disabling preemption in bpf_perf_event_output. A reduced sketch of the nesting scheme follows below.
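The crux is easier to see in a reduced form of the nesting scheme (a hedged sketch, not the exact kernel code: 'nest_level', 'sample_bufs', and 'NR_NEST_BUFS' are hypothetical names, while preempt_disable()/this_cpu_*() are the real kernel primitives). The per-CPU nesting counter can only be paired safely with the per-CPU buffer it indexes if no other task can run on the same CPU in between:

	/* Reduced sketch of the nesting protection; names are hypothetical. */
	static int emit_sample(void)
	{
		struct perf_sample_data *sd;
		int nest, err = 0;

		preempt_disable();		/* the fix: no preemption from here on */
		nest = this_cpu_inc_return(nest_level);
		if (nest > NR_NEST_BUFS) {	/* nested too deep: drop the event */
			err = -EBUSY;
			goto out;
		}
		sd = this_cpu_ptr(&sample_bufs[nest - 1]);
		/* ... fill @sd and emit the perf event ... */
	out:
		this_cpu_dec(nest_level);
		preempt_enable();
		return err;
	}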
Cc: stable@vger.kernel.org Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps") Acked-by: Hou Tao Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20230725084206.580930-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5f2dcabad202..bf18a53731b0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } -- cgit v1.2.3 From d62cc390c2e99ae267ffe4b8d7e2e08b6c758c32 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Jul 2023 10:42:06 +0200 Subject: bpf: Disable preemption in bpf_event_output We received a report [1] of a kernel crash caused by using the nesting protection without disabled preemption. bpf_event_output can be called by programs executed by the bpf_prog_run_array_cg function, which disables migration but keeps preemption enabled. This can cause a task to be preempted by another one inside the nesting protection, eventually leading to two tasks using the same perf_sample_data buffer and causing crashes like: BUG: kernel NULL pointer dereference, address: 0000000000000001 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page ... ? perf_output_sample+0x12a/0x9a0 ? finish_task_switch.isra.0+0x81/0x280 ? perf_event_output+0x66/0xa0 ? bpf_event_output+0x13a/0x190 ? bpf_event_output_data+0x22/0x40 ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb ? xa_load+0x87/0xe0 ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0 ? release_sock+0x3e/0x90 ? sk_setsockopt+0x1a1/0x12f0 ? udp_pre_connect+0x36/0x50 ? inet_dgram_connect+0x93/0xa0 ? __sys_connect+0xb4/0xe0 ? udp_setsockopt+0x27/0x40 ? __pfx_udp_push_pending_frames+0x10/0x10 ? __sys_setsockopt+0xdf/0x1a0 ? __x64_sys_connect+0xf/0x20 ? do_syscall_64+0x3a/0x90 ? entry_SYSCALL_64_after_hwframe+0x72/0xdc Fix this by disabling preemption in bpf_event_output.
[1] https://github.com/cilium/cilium/issues/26756 Cc: stable@vger.kernel.org Reported-by: Oleg "livelace" Popov Closes: https://github.com/cilium/cilium/issues/26756 Fixes: 2a916f2f546c ("bpf: Use migrate_disable/enable in array macros and cgroup/lirc code.") Acked-by: Hou Tao Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20230725084206.580930-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bf18a53731b0..bd1a42b23f3f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -718,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -735,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -751,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } -- cgit v1.2.3 From 1f9f4f4777e7958e5c1fbdfd9ddf4207dc00a40f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 25 Jul 2023 10:37:19 +0900 Subject: tracing/probes: Fix to add NULL check for BTF APIs Since find_btf_func_param() and btf_type_by_id() can return NULL, the caller must check the return value correctly.
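A small userspace sketch of the defensive pattern the patch applies, using simplified stand-ins for the kernel's ERR_PTR()/IS_ERR_OR_NULL() and a hypothetical lookup function: a call that can fail as either NULL or an error pointer must be screened for both before dereferencing.

```
/* Sketch only: find_param() and its behavior are invented for
 * illustration; the macros mimic the kernel's err.h semantics. */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095
#define IS_ERR(p)         ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define IS_ERR_OR_NULL(p) (!(p) || IS_ERR(p))

struct btf_param { const char *name; };

/* hypothetical lookup: may return a valid pointer, NULL, or ERR_PTR */
static struct btf_param *find_param(int id)
{
	static struct btf_param p = { "arg1" };

	if (id == 0)
		return NULL;                              /* not found */
	if (id < 0)
		return (struct btf_param *)(unsigned long)-ENOENT;
	return &p;
}

int main(void)
{
	int ids[] = { 1, 0, -1 };

	for (int i = 0; i < 3; i++) {
		struct btf_param *p = find_param(ids[i]);

		/* checking only IS_ERR() here would dereference NULL */
		if (IS_ERR_OR_NULL(p)) {
			printf("id %d: no entry\n", ids[i]);
			continue;
		}
		printf("id %d: %s\n", ids[i], p->name);
	}
	return 0;
}
```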
Link: https://lore.kernel.org/all/169024903951.395371.11361556840733470934.stgit@devnote2/ Fixes: b576e09701c7 ("tracing/probes: Support function parameters if BTF is available") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b2b726bea1f9..c68a72707852 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -386,12 +386,12 @@ static const struct btf_type *find_btf_func_proto(const char *funcname) /* Get BTF_KIND_FUNC type */ t = btf_type_by_id(btf, id); - if (!btf_type_is_func(t)) + if (!t || !btf_type_is_func(t)) return ERR_PTR(-ENOENT); /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ t = btf_type_by_id(btf, t->type); - if (!btf_type_is_func_proto(t)) + if (!t || !btf_type_is_func_proto(t)) return ERR_PTR(-ENOENT); return t; @@ -443,7 +443,7 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, if (!ctx->params) { params = find_btf_func_param(ctx->funcname, &ctx->nr_params, ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR(params)) { + if (IS_ERR_OR_NULL(params)) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); } @@ -1273,7 +1273,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], params = find_btf_func_param(ctx->funcname, &nr_params, ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR(params)) { + if (IS_ERR_OR_NULL(params)) { if (args_idx != -1) { /* $arg* requires BTF info */ trace_probe_log_err(0, NOSUP_BTFARG); -- cgit v1.2.3 From 5f0bc0b042fc77ff70e14c790abdec960cde4ec1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Jul 2023 09:38:32 -0700 Subject: mm: suppress mm fault logging if fatal signal already pending Commit eda0047296a1 ("mm: make the page fault mmap locking killable") intentionally made it much easier to trigger the "page fault fails because a fatal signal is pending" situation, by having the mmap locking fail early in that case. We have long aborted page faults in other fatal cases when the actual IO for a page is interrupted by SIGKILL - which is particularly useful for the traditional case of NFS hanging due to network issues, but local filesystems could cause it too if you happened to get the SIGKILL while waiting for a page to be faulted in (eg lock_folio_maybe_drop_mmap()). So aborting the page fault wasn't a new condition - but it now triggers earlier, before we even get to 'handle_mm_fault()'. And as a result the error doesn't go through our 'fault_signal_pending()' logic, and doesn't get filtered away there. Normally you'd never even notice, because if a fatal signal is pending, the new SIGSEGV we send ends up being ignored anyway. But it turns out that there is one very noticeable exception: if you enable 'show_unhandled_signals', the aborted page fault will be logged in the kernel messages, and you'll get a scary line looking something like this in your logs: pverados[2183248]: segfault at 55e5a00f9ae0 ip 000055e5a00f9ae0 sp 00007ffc0720bea8 error 14 in perl[55e5a00d4000+195000] likely on CPU 10 (core 4, socket 0) which is rather misleading. It's not really a segfault at all, it's just "the thread was killed before the page fault completed, so we aborted the page fault". Fix this by just making it clear that a pending fatal signal means that any new signal coming in after that is implicitly handled. 
This will avoid the misleading logging, since now the signal isn't 'unhandled' any more. Reported-and-tested-by: Fiona Ebner Tested-by: Thomas Lamprecht Link: https://lore.kernel.org/lkml/8d063a26-43f5-0bb7-3203-c6a04dc159f8@proxmox.com/ Acked-by: Oleg Nesterov Fixes: eda0047296a1 ("mm: make the page fault mmap locking killable") Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b5370fe5c198..128e9bb3d1a2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -562,6 +562,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig) if (handler != SIG_IGN && handler != SIG_DFL) return false; + /* If dying, we handle all new signals by ignoring them */ + if (fatal_signal_pending(tsk)) + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } -- cgit v1.2.3 From 2d093282b0d4357373497f65db6a05eb0c28b7c8 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 24 Jul 2023 13:40:40 +0800 Subject: ring-buffer: Fix wrong stat of cpu_buffer->read When pages are removed in rb_remove_pages(), 'cpu_buffer->read' is set to 0 in order to make sure any read iterators reset themselves. However, this messes up the 'entries' stat; see the following steps: # cd /sys/kernel/tracing/ # 1. Enlarge the ring buffer to prepare for later reducing: # echo 20 > per_cpu/cpu0/buffer_size_kb # 2. Write a log into ring buffer of cpu0: # taskset -c 0 echo "hello1" > trace_marker # 3. Read the log: # cat per_cpu/cpu0/trace_pipe <...>-332 [000] ..... 62.406844: tracing_mark_write: hello1 # 4. Stop reading and see the stats, now 0 entries, and 1 event read: # cat per_cpu/cpu0/stats entries: 0 [...] read events: 1 # 5. Reduce the ring buffer # echo 7 > per_cpu/cpu0/buffer_size_kb # 6. Now entries unexpectedly became 1 although there are actually no entries!!! # cat per_cpu/cpu0/stats entries: 1 [...] read events: 0 To fix it, introduce a 'pages_removed' field to count the total removed pages since the last reset, then use it to let read iterators reset themselves instead of changing the 'read' pointer.
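The idea generalizes to a common pattern, sketched below in plain C with illustrative names: a monotonic generation counter for invalidation instead of clobbering a statistic, so a model of the approach rather than the kernel structures themselves.

```
/* Sketch only: a monotonic pages_removed generation lets iterators
 * detect removal without touching the consumed-entry stat. */
#include <stdio.h>

struct buffer {
	unsigned long read;		/* consumed-entry stat, untouched */
	unsigned long pages_removed;	/* bumped on every page removal */
};

struct iter {
	unsigned long cache_read;
	unsigned long cache_pages_removed;
};

static void iter_reset(struct iter *it, const struct buffer *b)
{
	it->cache_read = b->read;
	it->cache_pages_removed = b->pages_removed;
}

static int iter_stale(const struct iter *it, const struct buffer *b)
{
	return it->cache_read != b->read ||
	       it->cache_pages_removed != b->pages_removed;
}

int main(void)
{
	struct buffer b = { .read = 1, .pages_removed = 0 };
	struct iter it;

	iter_reset(&it, &b);
	b.pages_removed++;	/* pages shrunk; the stat stays intact */
	printf("iterator stale: %d, read stat still: %lu\n",
	       iter_stale(&it, &b), b.read);
	return 0;
}
```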
Link: https://lore.kernel.org/linux-trace-kernel/20230724054040.3489499-1-zhengyejian1@huawei.com Cc: Cc: Fixes: 83f40318dab0 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index de061dd47313..46b4a3c7c3bf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -523,6 +523,8 @@ struct ring_buffer_per_cpu { rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; + /* pages removed since last reset */ + unsigned long pages_removed; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -559,6 +561,7 @@ struct ring_buffer_iter { struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; + unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; @@ -1957,6 +1960,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } + /* Read iterators need to reset themselves when some pages removed */ + cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; @@ -1978,12 +1983,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); - /* - * change read pointer to make sure any read iterators reset - * themselves - */ - cpu_buffer->read = 0; - /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); @@ -4395,6 +4394,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; + iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; @@ -4849,12 +4849,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; /* - * Check if someone performed a consuming read to - * the buffer. A consuming read invalidates the iterator - * and we need to reset the iterator in this case. + * Check if someone performed a consuming read to the buffer + * or removed some pages from the buffer. In these cases, + * iterator was invalidated and we need to reset it. 
*/ if (unlikely(iter->cache_read != cpu_buffer->read || - iter->cache_reader_page != cpu_buffer->reader_page)) + iter->cache_reader_page != cpu_buffer->reader_page || + iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: @@ -5298,6 +5299,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); + cpu_buffer->pages_removed = 0; } /* Must have disabled the cpu buffer then done a synchronize_rcu */ -- cgit v1.2.3 From 151e34d1c6eb2f9b4b0b9d085d1fa9f02cc74f48 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 24 Jul 2023 22:08:24 +0800 Subject: ring-buffer: Fix kernel-doc warnings in ring_buffer.c Fix kernel-doc warnings: kernel/trace/ring_buffer.c:954: warning: Function parameter or member 'cpu' not described in 'ring_buffer_wake_waiters' kernel/trace/ring_buffer.c:3383: warning: Excess function parameter 'event' description in 'ring_buffer_unlock_commit' kernel/trace/ring_buffer.c:5359: warning: Excess function parameter 'cpu' description in 'ring_buffer_reset_online_cpus' Link: https://lkml.kernel.org/r/20230724140827.1023266-2-cuigaosheng1@huawei.com Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 46b4a3c7c3bf..52dea5dd5362 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -950,6 +950,7 @@ static void rb_wake_up_waiters(struct irq_work *work) /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on + * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. @@ -3375,7 +3376,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to - * @event: The event pointer to commit. * * This commits the data to the ring buffer, and releases any locks held. 
* @@ -5358,7 +5358,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of - * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { -- cgit v1.2.3 From b32c789f7dbbba3e55f6295ad64923cb69e6f563 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 24 Jul 2023 22:08:25 +0800 Subject: tracing/synthetic: Fix kernel-doc warnings in trace_events_synth.c Fix kernel-doc warning: kernel/trace/trace_events_synth.c:1257: warning: Function parameter or member 'mod' not described in 'synth_event_gen_cmd_array_start' Link: https://lkml.kernel.org/r/20230724140827.1023266-3-cuigaosheng1@huawei.com Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d6a70aff2410..dd398afc8e25 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1230,6 +1230,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); * synth_event_gen_cmd_array_start - Start synthetic event command from an array * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * -- cgit v1.2.3 From bd7217f30c7f08fc20f745aa1e2174eb3ff6d82a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 24 Jul 2023 22:08:26 +0800 Subject: tracing: Fix kernel-doc warnings in trace_events_trigger.c Fix kernel-doc warnings: kernel/trace/trace_events_trigger.c:59: warning: Function parameter or member 'buffer' not described in 'event_triggers_call' kernel/trace/trace_events_trigger.c:59: warning: Function parameter or member 'event' not described in 'event_triggers_call' Link: https://lkml.kernel.org/r/20230724140827.1023266-4-cuigaosheng1@huawei.com Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e535959939d3..46439e3bcec4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data) /** * event_triggers_call - Call triggers associated with a trace event * @file: The trace_event_file associated with the event + * @buffer: The ring buffer that the event is being written to * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command. 
If rec is -- cgit v1.2.3 From 6c95d71bad61410e3717afcf7fd1837a8e03edf2 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 24 Jul 2023 22:08:27 +0800 Subject: tracing: Fix kernel-doc warnings in trace_seq.c Fix kernel-doc warning: kernel/trace/trace_seq.c:142: warning: Function parameter or member 'args' not described in 'trace_seq_vprintf' Link: https://lkml.kernel.org/r/20230724140827.1023266-5-cuigaosheng1@huawei.com Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_seq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e5e299260d0c..bac06ee3b98b 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string + * @args: Arguments for the format string * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formatting of a trace -- cgit v1.2.3 From dea499781a1150d285c62b26659f62fb00824fce Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 26 Jul 2023 17:58:04 +0800 Subject: tracing: Fix warning in trace_buffered_event_disable() A warning happened in trace_buffered_event_disable() at WARN_ON_ONCE(!trace_buffered_event_ref) Call Trace: ? __warn+0xa5/0x1b0 ? trace_buffered_event_disable+0x189/0x1b0 __ftrace_event_enable_disable+0x19e/0x3e0 free_probe_data+0x3b/0xa0 unregister_ftrace_function_probe_func+0x6b8/0x800 event_enable_func+0x2f0/0x3d0 ftrace_process_regex.isra.0+0x12d/0x1b0 ftrace_filter_write+0xe6/0x140 vfs_write+0x1c9/0x6f0 [...] The cause of the warning is that in __ftrace_event_enable_disable(), trace_buffered_event_enable() was called once while trace_buffered_event_disable() was called twice. The reproduction script is shown below; for analysis, see the comments: ``` #!/bin/bash cd /sys/kernel/tracing/ # 1. Register a 'disable_event' command, then: # 1) SOFT_DISABLED_BIT was set; # 2) trace_buffered_event_enable() was called first time; echo 'cmdline_proc_show:disable_event:initcall:initcall_finish' > \ set_ftrace_filter # 2. Enable the event registered, then: # 1) SOFT_DISABLED_BIT was cleared; # 2) trace_buffered_event_disable() was called first time; echo 1 > events/initcall/initcall_finish/enable # 3. Try to call into cmdline_proc_show(), then SOFT_DISABLED_BIT was # set again!!! cat /proc/cmdline # 4. Unregister the 'disable_event' command, then: # 1) SOFT_DISABLED_BIT was cleared again; # 2) trace_buffered_event_disable() was called second time!!! echo '!cmdline_proc_show:disable_event:initcall:initcall_finish' > \ set_ftrace_filter ``` To fix it, IIUC, we can instead call trace_buffered_event_enable() only the first time soft mode is enabled, and call trace_buffered_event_disable() only the last time soft mode is disabled.
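A sketch of the balance the fix establishes, modeled with plain counters under assumed names (sm_ref mirrors the patch's use of file->sm_ref; buffered_ref is an illustrative stand-in for trace_buffered_event_ref): tying enable to the 0->1 transition and disable to the 1->0 transition makes mismatched call counts structurally impossible.

```
/* Sketch only: transition-based pairing of enable/disable. */
#include <stdio.h>

static int sm_ref;		/* models file->sm_ref */
static int buffered_ref;	/* models trace_buffered_event_ref */

static void soft_mode_get(void)
{
	if (++sm_ref == 1)
		buffered_ref++;	/* trace_buffered_event_enable() */
}

static void soft_mode_put(void)
{
	if (--sm_ref == 0)
		buffered_ref--;	/* trace_buffered_event_disable() */
}

int main(void)
{
	soft_mode_get();	/* 'disable_event' trigger registered */
	soft_mode_get();	/* event soft-enabled again */
	soft_mode_put();
	soft_mode_put();	/* trigger unregistered */
	printf("buffered_ref = %d (must be 0)\n", buffered_ref);
	return 0;
}
```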
Link: https://lore.kernel.org/linux-trace-kernel/20230726095804.920457-1-zhengyejian1@huawei.com Cc: Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6ae4eae510..578f1f7d49a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Disable use of trace_buffered_event */ + trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); @@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Enable use of trace_buffered_event */ + trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { @@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } - /* Enable or disable use of trace_buffered_event */ - if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != - (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { - if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) - trace_buffered_event_enable(); - else - trace_buffered_event_disable(); - } - return ret; } -- cgit v1.2.3 From de02f2ac5d8cfb311f44f2bf144cc20002f1fbbd Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 10:50:47 +0900 Subject: kprobes: Prohibit probing on CFI preamble symbol Do not allow probing on symbols that start with "__cfi_" or "__pfx_", because those are used for CFI and are not executed. Probing them will break the CFI.
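A userspace sketch of the intended check, with str_has_prefix() modeled via strncmp(): the probe address's resolved symbol name is tested against the two preamble prefixes and the probe is rejected on a match. The symbol-resolution step is simplified away here.

```
/* Sketch only: what matters is the semantics "symbol name starts
 * with the prefix"; the kernel resolves the name from the address
 * via lookup_symbol_name() first. */
#include <stdio.h>
#include <string.h>

static int has_prefix(const char *str, const char *prefix)
{
	return strncmp(str, prefix, strlen(prefix)) == 0;
}

static int is_cfi_preamble_symbol(const char *symbuf)
{
	return has_prefix(symbuf, "__cfi_") ||
	       has_prefix(symbuf, "__pfx_");
}

int main(void)
{
	const char *syms[] = { "__cfi_do_sys_open", "__pfx_vfs_read",
			       "do_sys_open" };

	for (int i = 0; i < 3; i++)
		printf("%-20s probe %s\n", syms[i],
		       is_cfi_preamble_symbol(syms[i]) ? "rejected"
						       : "allowed");
	return 0;
}
```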
Link: https://lore.kernel.org/all/168904024679.116016.18089228029322008512.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/kprobes.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fc6095d502d..ca385b61d546 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1545,6 +1545,17 @@ static int check_ftrace_location(struct kprobe *p) return 0; } +static bool is_cfi_preamble_symbol(unsigned long addr) +{ + char symbuf[KSYM_NAME_LEN]; + + if (lookup_symbol_name(addr, symbuf)) + return false; + + return str_has_prefix("__cfi_", symbuf) || + str_has_prefix("__pfx_", symbuf); +} + static int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { @@ -1563,7 +1574,8 @@ static int check_kprobe_address_safe(struct kprobe *p, within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || - find_bug((unsigned long)p->addr)) { + find_bug((unsigned long)p->addr) || + is_cfi_preamble_symbol((unsigned long)p->addr)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 640a604585aa30f93e39b17d4d6ba69fcb1e66c9 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 29 Jul 2023 17:51:06 +0800 Subject: bpf, cpumap: Make sure kthread is running before map update returns The following warning was reported when running stress-mode enabled xdp_redirect_cpu with some RT threads: ------------[ cut here ]------------ WARNING: CPU: 4 PID: 65 at kernel/bpf/cpumap.c:135 CPU: 4 PID: 65 Comm: kworker/4:1 Not tainted 6.5.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Workqueue: events cpu_map_kthread_stop RIP: 0010:put_cpu_map_entry+0xda/0x220 ...... Call Trace: ? show_regs+0x65/0x70 ? __warn+0xa5/0x240 ...... ? put_cpu_map_entry+0xda/0x220 cpu_map_kthread_stop+0x41/0x60 process_one_work+0x6b0/0xb80 worker_thread+0x96/0x720 kthread+0x1a5/0x1f0 ret_from_fork+0x3a/0x70 ret_from_fork_asm+0x1b/0x30 The root cause is the same as commit 436901649731 ("bpf: cpumap: Fix memory leak in cpu_map_update_elem"). The kthread is stopped prematurely by kthread_stop() in cpu_map_kthread_stop(), and kthread() doesn't call cpu_map_kthread_run() at all, but the XDP program has already queued some frames or skbs into the ptr_ring. So when __cpu_map_ring_cleanup() checks the ptr_ring, it will find it was not emptied and report a warning. An alternative fix is to use __cpu_map_ring_cleanup() to drop these pending frames or skbs when kthread_stop() returns -EINTR, but it may confuse the user, because these frames or skbs have been handled correctly by the XDP program. So instead of dropping these frames or skbs, just make sure the per-cpu kthread is running before __cpu_map_entry_alloc() returns. After applying the fix, the error handling for kthread_stop() will be unnecessary because it will always return 0, so just remove it.
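The ordering the fix enforces can be modeled with POSIX threads standing in for kthreads and a condition variable standing in for the completion; a sketch under those assumptions:

```
/* Sketch only: the creator waits until the worker signals that it has
 * started, so a subsequent stop request can never observe a thread
 * that has not begun running (the kernel uses a struct completion). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int running;

static void *map_kthread_run(void *arg)
{
	pthread_mutex_lock(&lock);
	running = 1;			/* complete(&kthread_running) */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	/* ... drain the ptr_ring until asked to stop ... */
	return NULL;
}

int main(void)
{
	pthread_t thr;

	pthread_create(&thr, NULL, map_kthread_run, NULL);

	pthread_mutex_lock(&lock);	/* wait_for_completion() */
	while (!running)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	printf("worker is running; safe to return from update\n");
	pthread_join(thr, NULL);	/* kthread_stop() analogue */
	return 0;
}
```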
Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP") Signed-off-by: Hou Tao Reviewed-by: Pu Lehui Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20230729095107.1722450-2-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/cpumap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6ae02be7a48e..7eeb20025164 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,7 @@ struct bpf_cpu_map_entry { struct rcu_head rcu; struct work_struct kthread_stop_wq; + struct completion kthread_running; }; struct bpf_cpu_map { @@ -153,7 +155,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) static void cpu_map_kthread_stop(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; - int err; rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); @@ -163,14 +164,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) rcu_barrier(); /* kthread_stop will wake_up_process and wait for it to complete */ - err = kthread_stop(rcpu->kthread); - if (err) { - /* kthread_stop may be called before cpu_map_kthread_run - * is executed, so we need to release the memory related - * to rcpu. - */ - put_cpu_map_entry(rcpu); - } + kthread_stop(rcpu->kthread); } static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, @@ -298,11 +292,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, return nframes; } - static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected @@ -467,6 +461,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, goto free_ptr_ring; /* Setup kthread */ + init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); @@ -480,6 +475,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); + /* Make sure kthread has been running, so kthread_stop() will not + * stop the kthread prematurely and all pending frames or skbs + * will be handled by the kthread before kthread_stop() returns. + */ + wait_for_completion(&rcpu->kthread_running); + return rcpu; free_prog: -- cgit v1.2.3 From 7c62b75cd1a792e14b037fa4f61f9b18914e7de1 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 29 Jul 2023 17:51:07 +0800 Subject: bpf, cpumap: Handle skb as well when clean up ptr_ring The following warning was reported when running xdp_redirect_cpu with both skb-mode and stress-mode enabled: ------------[ cut here ]------------ Incorrect XDP memory type (-2128176192) usage WARNING: CPU: 7 PID: 1442 at net/core/xdp.c:405 Modules linked in: CPU: 7 PID: 1442 Comm: kworker/7:0 Tainted: G 6.5.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Workqueue: events __cpu_map_entry_free RIP: 0010:__xdp_return+0x1e4/0x4a0 ...... Call Trace: ? show_regs+0x65/0x70 ? __warn+0xa5/0x240 ? __xdp_return+0x1e4/0x4a0 ...... xdp_return_frame+0x4d/0x150 __cpu_map_entry_free+0xf9/0x230 process_one_work+0x6b0/0xb80 worker_thread+0x96/0x720 kthread+0x1a5/0x1f0 ret_from_fork+0x3a/0x70 ret_from_fork_asm+0x1b/0x30 The reason for the warning is twofold. 
One is that the kthread cpu_map_kthread_run() is stopped prematurely. The other is that __cpu_map_ring_cleanup() doesn't handle skb mode and treats skbs in the ptr_ring as XDP frames. The prematurely stopped kthread is fixed by the preceding patch, and the ptr_ring will be empty when __cpu_map_ring_cleanup() is called. But as the comment in __cpu_map_ring_cleanup() says, handle and free skbs in the ptr_ring as well to "catch any broken behaviour gracefully". Fixes: 11941f8a8536 ("bpf: cpumap: Implement generic cpumap") Signed-off-by: Hou Tao Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20230729095107.1722450-3-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/cpumap.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 7eeb20025164..286ab3db0fde 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -131,11 +131,17 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_frame *xdpf; + void *ptr; - while ((xdpf = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf); + while ((ptr = ptr_ring_consume(ring))) { + WARN_ON_ONCE(1); + if (unlikely(__ptr_test_bit(0, &ptr))) { + __ptr_clear_bit(0, &ptr); + kfree_skb(ptr); + continue; + } + xdp_return_frame(ptr); + } } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -- cgit v1.2.3 From df2f7cde73cb58c0e6a60f97d1cd6037138a45cd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 7 Aug 2023 10:33:57 +0200 Subject: PM: hibernate: fix resume_store() return value when hibernation not available On a laptop with hibernation set up but not actively used, and with secure boot and a lockdown-enabled kernel, 6.5-rc1 gets stuck on boot with the following repeated messages: A start job is running for Resume from hibernation using device /dev/system/swap (24s / no limit) lockdown_is_locked_down: 25311154 callbacks suppressed Lockdown: systemd-hiberna: hibernation is restricted; see man kernel_lockdown.7 ... Checking the resume code leads to commit cc89c63e2fe3 ("PM: hibernate: move finding the resume device out of software_resume") which inadvertently changed the return value from resume_store() to 0 when !hibernation_available(). This apparently translates to userspace write() returning 0 as the number of bytes written, and userspace looping indefinitely in the attempt to write the intended value. Fix this by returning the full number of bytes that were to be written, as that's what was done before the commit. Fixes: cc89c63e2fe3 ("PM: hibernate: move finding the resume device out of software_resume") Signed-off-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1b4bfa938dd..2b4a946a6ff5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, int error; if (!hibernation_available()) - return 0; + return n; if (len && buf[len-1] == '\n') len--; -- cgit v1.2.3 From b71645d6af10196c46cbe3732de2ea7d36b3ff6d Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Sat, 5 Aug 2023 11:38:15 +0800 Subject: tracing: Fix cpu buffers unavailable due to 'record_disabled' missed The trace ring buffer can no longer record anything after executing the following commands at the shell prompt: # cd /sys/kernel/tracing # cat tracing_cpumask fff # echo 0 > tracing_cpumask # echo 1 > snapshot # echo fff > tracing_cpumask # echo 1 > tracing_on # echo "hello world" > trace_marker -bash: echo: write error: Bad file descriptor The root cause is that: 1. After `echo 0 > tracing_cpumask`, 'record_disabled' of cpu buffers in 'tr->array_buffer.buffer' became 1 (see tracing_set_cpumask()); 2. After `echo 1 > snapshot`, 'tr->array_buffer.buffer' is swapped with 'tr->max_buffer.buffer', then the 'record_disabled' became 0 (see update_max_tr()); 3. After `echo fff > tracing_cpumask`, the 'record_disabled' becomes -1; Then array_buffer and max_buffer are both unavailable because the value of 'record_disabled' is not 0. To fix it, enable or disable both array_buffer and max_buffer at the same time in tracing_set_cpumask(). Link: https://lkml.kernel.org/r/20230805033816.3284594-2-zhengyejian1@huawei.com Cc: Cc: Cc: Fixes: 71babb2705e2 ("tracing: change CPU ring buffer state from tracing_cpumask") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8870078ef58..b0e8eb6ea8ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5277,11 +5277,17 @@ int tracing_set_cpumask(struct trace_array *tr, !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif } } arch_spin_unlock(&tr->max_lock); -- cgit v1.2.3 From ddeea494a16f32522bce16ee65f191d05d4b8282 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 16 Aug 2023 17:49:26 +0200 Subject: tracing/synthetic: Use union instead of casts The current code uses a lot of casts to access the fields member in struct synth_trace_event with different sizes. This makes the code hard to read, and has already introduced an endianness bug. Use a union and struct instead.
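A compact illustration of the union approach, compilable in userspace; the field names follow the patch, while the layout of the dynamic descriptor is simplified (the kernel's struct trace_dynamic_info orders its members based on endianness).

```
/* Sketch only: the union gives typed views of one 64-bit slot,
 * replacing casts on a raw u64 array. */
#include <stdint.h>
#include <stdio.h>

union trace_synth_field {
	uint8_t  as_u8;
	uint16_t as_u16;
	uint32_t as_u32;
	uint64_t as_u64;
	struct {
		uint16_t len;		/* simplified vs. the kernel */
		uint16_t offset;
	} as_dynamic;
};

int main(void)
{
	union trace_synth_field f = { .as_u64 = 0 };

	f.as_u16 = 0x1234;		/* store a 2-byte field, no casts */
	printf("u16 %#x raw %#llx\n", f.as_u16,
	       (unsigned long long)f.as_u64);

	f.as_u64 = 0;
	f.as_dynamic.offset = 64;	/* describe a dynamic region */
	f.as_dynamic.len = 16;
	printf("dynamic: offset %u len %u\n",
	       (unsigned)f.as_dynamic.offset, (unsigned)f.as_dynamic.len);
	return 0;
}
```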
Link: https://lkml.kernel.org/r/20230816154928.4171614-2-svens@linux.ibm.com Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Fixes: 00cf3d672a9dd ("tracing: Allow synthetic events to pass around stacktraces") Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 8 ++++ kernel/trace/trace_events_synth.c | 87 +++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e1edc2197fc8..95956f75bea5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1295,6 +1295,14 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); +union trace_synth_field { + u8 as_u8; + u16 as_u16; + u32 as_u32; + u64 as_u64; + struct trace_dynamic_info as_dynamic; +}; + struct ftrace_event_field { struct list_head link; const char *name; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index dd398afc8e25..7fff8235075f 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event, struct synth_trace_event { struct trace_entry ent; - u64 fields[]; + union trace_synth_field fields[]; }; static int synth_event_define_fields(struct trace_event_call *call) @@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type) static void print_synth_event_num_val(struct trace_seq *s, char *print_fmt, char *name, - int size, u64 val, char *space) + int size, union trace_synth_field *val, char *space) { switch (size) { case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u8, space); break; case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u16, space); break; case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u32, space); break; default: @@ -374,36 +374,26 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { if (se->fields[i]->is_dynamic) { - u32 offset, data_offset; - char *str_field; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - - str_field = (char *)entry + data_offset; + union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - str_field, + (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - (char *)&entry->fields[n_u64], + (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? 
"" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } } else if (se->fields[i]->is_stack) { - u32 offset, data_offset, len; unsigned long *p, *end; + union trace_synth_field *data = &entry->fields[n_u64]; - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - len = offset >> 16; - - p = (void *)entry + data_offset; - end = (void *)p + len - (sizeof(long) - 1); + p = (void *)entry + data->as_dynamic.offset; + end = (void *)p + data->as_dynamic.len - (sizeof(long) - 1); trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); @@ -419,13 +409,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, print_synth_event_num_val(s, print_fmt, se->fields[i]->name, se->fields[i]->size, - entry->fields[n_u64], + &entry->fields[n_u64], space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", - entry->fields[n_u64], + entry->fields[n_u64].as_u64, __flags); trace_seq_putc(s, ')'); } @@ -454,21 +444,16 @@ static unsigned int trace_string(struct synth_trace_event *entry, int ret; if (is_dynamic) { - u32 data_offset; + union trace_synth_field *data = &entry->fields[*n_u64]; - data_offset = struct_size(entry, fields, event->n_u64); - data_offset += data_size; - - len = fetch_store_strlen((unsigned long)str_val); - - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; + data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); (*n_u64)++; } else { - str_field = (char *)&entry->fields[*n_u64]; + str_field = (char *)&entry->fields[*n_u64].as_u64; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) @@ -492,6 +477,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry, unsigned int data_size, unsigned int *n_u64) { + union trace_synth_field *data = &entry->fields[*n_u64]; unsigned int len; u32 data_offset; void *data_loc; @@ -515,8 +501,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, memcpy(data_loc, stack, len); /* Fill in the field that holds the offset/len combo */ - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + + data->as_dynamic.offset = data_offset; + data->as_dynamic.len = len; (*n_u64)++; @@ -592,19 +579,19 @@ static notrace void trace_event_raw_event_synth(void *__data, switch (field->size) { case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; + entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; + entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; + entry->fields[n_u64].as_u32 = (u32)val; break; default: - entry->fields[n_u64] = val; + entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1791,19 +1778,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) 
switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1884,19 +1871,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -2031,19 +2018,19 @@ static int __synth_event_add_val(const char *field_name, u64 val, } else { switch (field->size) { case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + trace_state->entry->fields[field->offset].as_u8 = (u8)val; break; case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + trace_state->entry->fields[field->offset].as_u16 = (u16)val; break; case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + trace_state->entry->fields[field->offset].as_u32 = (u32)val; break; default: - trace_state->entry->fields[field->offset] = val; + trace_state->entry->fields[field->offset].as_u64 = val; break; } } -- cgit v1.2.3 From 887f92e09ef34a949745ad26ce82be69e2dabcf6 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 16 Aug 2023 17:49:27 +0200 Subject: tracing/synthetic: Skip first entry for stack traces While debugging another issue I noticed that the stack trace output contains the number of entries on top: -0 [000] d..4. 203.322502: wake_lat: pid=0 delta=2268270616 stack=STACK: => 0x10 => __schedule+0xac6/0x1a98 => schedule+0x126/0x2c0 => schedule_timeout+0x242/0x2c0 => __wait_for_common+0x434/0x680 => __wait_rcu_gp+0x198/0x3e0 => synchronize_rcu+0x112/0x138 => ring_buffer_reset_online_cpus+0x140/0x2e0 => tracing_reset_online_cpus+0x15c/0x1d0 => tracing_set_clock+0x180/0x1d8 => hist_register_trigger+0x486/0x670 => event_hist_trigger_parse+0x494/0x1318 => trigger_process_regex+0x1d4/0x258 => event_trigger_write+0xb4/0x170 => vfs_write+0x210/0xad0 => ksys_write+0x122/0x208 Fix this by skipping the first element. Also replace the pointer logic with an index variable which is easier to read. 
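A tiny userspace sketch of the iteration change, using made-up addresses: the stored stack begins with a count word, so printing starts at index 1, and the recorded length bounds the loop instead of walking pointers toward a sentinel.

```
/* Sketch only: word 0 holds the number of entries, as in the
 * stored format; entries follow from index 1. */
#include <stdio.h>

int main(void)
{
	unsigned long stack[] = { 3, 0xffff0001, 0xffff0002, 0xffff0003 };
	unsigned long len = sizeof(stack);	/* as_dynamic.len analogue */

	for (unsigned long j = 1; j < len / sizeof(long); j++)
		printf("=> %#lx\n", stack[j]);
	return 0;
}
```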
Link: https://lkml.kernel.org/r/20230816154928.4171614-3-svens@linux.ibm.com Cc: Masami Hiramatsu Fixes: 00cf3d672a9d ("tracing: Allow synthetic events to pass around stacktraces") Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 7fff8235075f..80a2a832f857 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct synth_trace_event *entry; struct synth_event *se; - unsigned int i, n_u64; + unsigned int i, j, n_u64; char print_fmt[32]; const char *fmt; @@ -389,18 +389,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } } else if (se->fields[i]->is_stack) { - unsigned long *p, *end; union trace_synth_field *data = &entry->fields[n_u64]; - - p = (void *)entry + data->as_dynamic.offset; - end = (void *)p + data->as_dynamic.len - (sizeof(long) - 1); + unsigned long *p = (void *)entry + data->as_dynamic.offset; trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); - - for (; *p && p < end; p++) - trace_seq_printf(s, "=> %pS\n", (void *)*p); + for (j = 1; j < data->as_dynamic.len / sizeof(long); j++) + trace_seq_printf(s, "=> %pS\n", (void *)p[j]); n_u64++; - } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -490,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry, break; } - /* Include the zero'd element if it fits */ - if (len < HIST_STACKTRACE_DEPTH) - len++; - len *= sizeof(long); /* Find the dynamic section to copy the stack into. */ -- cgit v1.2.3 From c4d6b5438116c184027b2e911c0f2c7c406fb47c Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 16 Aug 2023 17:49:28 +0200 Subject: tracing/synthetic: Allocate one additional element for size While debugging another issue I noticed that the stack trace contains one invalid entry at the end: -0 [008] d..4. 26.484201: wake_lat: pid=0 delta=2629976084 000000009cc24024 stack=STACK: => __schedule+0xac6/0x1a98 => schedule+0x126/0x2c0 => schedule_timeout+0x150/0x2c0 => kcompactd+0x9ca/0xc20 => kthread+0x2f6/0x3d8 => __ret_from_fork+0x8a/0xe8 => 0x6b6b6b6b6b6b6b6b This is because the code failed to add the one element containing the number of entries to field_size. 
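The size computation after the fix amounts to one extra element; a trivial sketch with an assumed depth value:

```
/* Sketch only: the reserved length must cover the count word plus
 * the captured entries, i.e. (n_entries + 1) longs. */
#include <stdio.h>

int main(void)
{
	unsigned long n_entries = 6;	/* e.g. captured stack depth */
	unsigned long len = (n_entries + 1) * sizeof(unsigned long);

	printf("reserve %lu bytes for %lu entries plus the size word\n",
	       len, n_entries);
	return 0;
}
```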
Link: https://lkml.kernel.org/r/20230816154928.4171614-4-svens@linux.ibm.com Cc: Masami Hiramatsu Fixes: 00cf3d672a9d ("tracing: Allow synthetic events to pass around stacktraces") Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 80a2a832f857..9897d0bfcab7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -528,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data, str_val = (char *)(long)var_ref_vals[val_idx]; if (event->dynamic_fields[i]->is_stack) { - len = *((unsigned long *)str_val); + /* reserve one extra element for size */ + len = *((unsigned long *)str_val) + 1; len *= sizeof(unsigned long); } else { len = fetch_store_strlen((unsigned long)str_val); -- cgit v1.2.3 From eecb91b9f98d6427d4af5fdb8f108f52572a39e7 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 17 Aug 2023 20:55:39 +0800 Subject: tracing: Fix memleak due to race between current_tracer and trace Kmemleak reports a leak in graph_trace_open(): unreferenced object 0xffff0040b95f4a00 (size 128): comm "cat", pid 204981, jiffies 4301155872 (age 99771.964s) hex dump (first 32 bytes): e0 05 e7 b4 ab 7d 00 00 0b 00 01 00 00 00 00 00 .....}.......... f4 00 01 10 00 a0 ff ff 00 00 00 00 65 00 10 00 ............e... backtrace: [<000000005db27c8b>] kmem_cache_alloc_trace+0x348/0x5f0 [<000000007df90faa>] graph_trace_open+0xb0/0x344 [<00000000737524cd>] __tracing_open+0x450/0xb10 [<0000000098043327>] tracing_open+0x1a0/0x2a0 [<00000000291c3876>] do_dentry_open+0x3c0/0xdc0 [<000000004015bcd6>] vfs_open+0x98/0xd0 [<000000002b5f60c9>] do_open+0x520/0x8d0 [<00000000376c7820>] path_openat+0x1c0/0x3e0 [<00000000336a54b5>] do_filp_open+0x14c/0x324 [<000000002802df13>] do_sys_openat2+0x2c4/0x530 [<0000000094eea458>] __arm64_sys_openat+0x130/0x1c4 [<00000000a71d7881>] el0_svc_common.constprop.0+0xfc/0x394 [<00000000313647bf>] do_el0_svc+0xac/0xec [<000000002ef1c651>] el0_svc+0x20/0x30 [<000000002fd4692a>] el0_sync_handler+0xb0/0xb4 [<000000000c309c35>] el0_sync+0x160/0x180 The root cause is described as follows: __tracing_open() { // 1. File 'trace' is being opened; ... *iter->trace = *tr->current_trace; // 2. Tracer 'function_graph' is // currently set; ... iter->trace->open(iter); // 3. Call graph_trace_open() here, // and memory is allocated in it; ... } s_start() { // 4. The opened file is being read; ... *iter->trace = *tr->current_trace; // 5. If the tracer is switched to // 'nop' or others, then the memory // in step 3 is leaked!!! ... } To fix it, in s_start(), close the tracer before switching, then reopen the new tracer after switching. And some tracers like 'wakeup' may not update 'iter->private' in some cases when reopened, so it should be cleared to avoid being mistakenly closed again.
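A userspace sketch of the close-before-switch pattern, with types reduced to bare callbacks and invented tracer names: every open() allocation stays paired with exactly one close().

```
/* Sketch only: models the pairing discipline, not the real tracer API. */
#include <stdio.h>
#include <stdlib.h>

struct tracer {
	const char *name;
	void (*open)(void **priv);
	void (*close)(void **priv);
};

static void graph_open(void **priv)
{
	*priv = malloc(128);		/* what would otherwise leak */
	printf("open  %p\n", *priv);
}

static void graph_close(void **priv)
{
	printf("close %p\n", *priv);
	free(*priv);
	*priv = NULL;
}

static void switch_tracer(struct tracer **cur, struct tracer *next,
			  void **priv)
{
	if (*cur && (*cur)->close)
		(*cur)->close(priv);	/* close before switching */
	*cur = next;
	if ((*cur)->open)
		(*cur)->open(priv);	/* reopen the new tracer */
}

int main(void)
{
	struct tracer graph = { "function_graph", graph_open, graph_close };
	struct tracer nop = { "nop", NULL, NULL };
	struct tracer *cur = NULL;
	void *priv = NULL;

	switch_tracer(&cur, &graph, &priv);
	switch_tracer(&cur, &nop, &priv);	/* no leak: graph closed */
	return 0;
}
```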
Link: https://lore.kernel.org/linux-trace-kernel/20230817125539.1646321-1-zhengyejian1@huawei.com Fixes: d7350c3f4569 ("tracing/core: make the read callbacks reentrants") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 ++++++++- kernel/trace/trace_irqsoff.c | 3 ++- kernel/trace/trace_sched_wakeup.c | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b0e8eb6ea8ac..29a2e4d7886d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4213,8 +4213,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); *iter->trace = *tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 590b3d51afae..ba37f768e2f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - + else + iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 330aee1c1a49..0469a04a355f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); + else + iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) -- cgit v1.2.3 From c2489bb7e6be2e8cdced12c16c42fa128403ac03 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Fri, 18 Aug 2023 10:26:45 +0800 Subject: tracing: Introduce pipe_cpumask to avoid race on trace_pipes There is a race issue when concurrently doing splice_read on the main trace_pipe and the per_cpu trace_pipes, which results in the data read out being different from what was actually written. As suggested by Steven: > I believe we should add a ref count to trace_pipe and the per_cpu > trace_pipes, where if they are opened, nothing else can read it. > > Opening trace_pipe locks all per_cpu ref counts, if any of them are > open, then the trace_pipe open will fail (and releases any ref counts > it had taken). > > Opening a per_cpu trace_pipe will up the ref count for just that > CPU buffer. This will allow multiple tasks to read different per_cpu > trace_pipe files, but will prevent the main trace_pipe file from > being opened. But because we only need to know whether a per_cpu trace_pipe is open or not, using a cpumask instead of a ref count may be easier. After this patch, users will find that: - The main trace_pipe can be opened by only one user, and if it is opened, all per_cpu trace_pipes cannot be opened; - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu trace_pipe can only be opened by one user. And if one of them is opened, the main trace_pipe cannot be opened.
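A userspace sketch of the pipe_cpumask policy using a plain bitmask (single-threaded model, no locking, CPU count assumed): opening the global pipe needs the mask empty and then claims every bit; opening a per-cpu pipe claims one bit.

```
/* Sketch only: a bitmask stands in for the kernel cpumask. */
#include <stdio.h>

#define NCPUS 8
#define ALL_CPUS -1

static unsigned int pipe_mask;	/* bit n set = cpu n pipe is open */

static int open_pipe_on_cpu(int cpu)
{
	if (cpu == ALL_CPUS) {
		if (pipe_mask == 0) {
			pipe_mask = (1u << NCPUS) - 1;
			return 0;
		}
	} else if (!(pipe_mask & (1u << cpu))) {
		pipe_mask |= 1u << cpu;
		return 0;
	}
	return -1;			/* -EBUSY in the kernel code */
}

int main(void)
{
	printf("open cpu0: %d\n", open_pipe_on_cpu(0));		/* ok */
	printf("open cpu0: %d\n", open_pipe_on_cpu(0));		/* busy */
	printf("open all : %d\n", open_pipe_on_cpu(ALL_CPUS));	/* busy */
	return 0;
}
```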
Link: https://lore.kernel.org/linux-trace-kernel/20230818022645.1948314-1-zhengyejian1@huawei.com Suggested-by: Steven Rostedt (Google) Signed-off-by: Zheng Yejian Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 55 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 2 ++ 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29a2e4d7886d..8e64aaad5361 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6718,10 +6718,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, #endif +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; + int cpu; int ret; ret = tracing_check_open_get_tr(tr); @@ -6729,13 +6755,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return ret; mutex_lock(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; - __trace_array_put(tr); - goto out; + goto fail_alloc_iter; } trace_seq_init(&iter->seq); @@ -6758,7 +6787,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->tr = tr; iter->array_buffer = &tr->array_buffer; - iter->cpu_file = tracing_get_cpu(inode); + iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; @@ -6768,12 +6797,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); tr->trace_ref++; -out: + mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; @@ -6790,7 +6822,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) if (iter->trace->pipe_close) iter->trace->pipe_close(iter); - + close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); @@ -9454,6 +9486,9 @@ static struct trace_array *trace_array_create(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9495,6 +9530,7 @@ static struct trace_array *trace_array_create(const char *name) out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -9597,6 +9633,7 @@ static int __remove_instance(struct trace_array *tr) } 
kfree(tr->topts); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -10394,12 +10431,14 @@ __init static int tracer_alloc_buffers(void) if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); - goto out_free_savedcmd; + goto out_free_pipe_cpumask; } - if (global_trace.buffer_disabled) tracing_off(); @@ -10452,6 +10491,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95956f75bea5..73eaec158473 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,8 @@ struct trace_array { struct list_head events; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + /* one per_cpu trace_pipe can be opened by only one user */ + cpumask_var_t pipe_cpumask; int ref; int trace_ref; #ifdef CONFIG_FUNCTION_TRACER -- cgit v1.2.3 From 9f5deb551655a4cff04b21ecffdcdab75112da3a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sat, 26 Aug 2023 17:40:04 +0200 Subject: genirq: Fix software resend lockup and nested resend The switch to using hlist for managing software resend of interrupts broke resend in at least two ways: First, unconditionally adding interrupt descriptors to the resend list can corrupt the list when the descriptor in question has already been added. This causes the resend tasklet to loop indefinitely with interrupts disabled as was recently reported with the Lenovo ThinkPad X13s after threaded NAPI was disabled in the ath11k WiFi driver. This bug is easily fixed by restoring the old semantics of irq_sw_resend() so that it can be called also for descriptors that have already been marked for resend. Second, the offending commit also broke software resend of nested interrupts by simply discarding the code that made sure that such interrupts are retriggered using the parent interrupt. Add back the corresponding code that adds the parent descriptor to the resend list. Fixes: bc06a9e08742 ("genirq: Use hlist for managing resend handlers") Signed-off-by: Johan Hovold Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/lkml/20230809073432.4193-1-johan+linaro@kernel.org/ Link: https://lore.kernel.org/r/20230826154004.1417-1-johan+linaro@kernel.org --- kernel/irq/resend.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index edec335c0a7a..5f2c66860ac6 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -68,11 +68,16 @@ static int irq_sw_resend(struct irq_desc *desc) */ if (!desc->parent_irq) return -EINVAL; + + desc = irq_to_desc(desc->parent_irq); + if (!desc) + return -EINVAL; } /* Add to resend_list and activate the softirq: */ raw_spin_lock(&irq_resend_lock); - hlist_add_head(&desc->resend_node, &irq_resend_list); + if (hlist_unhashed(&desc->resend_node)) + hlist_add_head(&desc->resend_node, &irq_resend_list); raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; -- cgit v1.2.3
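The list-corruption half of the genirq fix boils down to an enqueue guard; a userspace sketch with a simple linked list and an on-list flag standing in for hlist_unhashed():

```
/* Sketch only: a descriptor is appended to the resend list at most
 * once, which is the guard the fix restores. */
#include <stdio.h>

struct desc {
	int irq;
	int on_list;		/* hlist_unhashed() analogue */
	struct desc *next;
};

static struct desc *resend_list;

static void sw_resend(struct desc *d)
{
	if (!d->on_list) {	/* skip if already queued */
		d->next = resend_list;
		resend_list = d;
		d->on_list = 1;
	}
}

int main(void)
{
	struct desc d = { .irq = 42 };
	int n = 0;

	sw_resend(&d);
	sw_resend(&d);		/* second call must be a no-op */

	for (struct desc *p = resend_list; p; p = p->next)
		n++;
	printf("list entries: %d (must be 1)\n", n);
	return 0;
}
```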