From ae2ad293d6be143ad223f5f947cca07bcbe42595 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Tue, 20 Jun 2023 16:07:47 +0800
Subject: sched/fair: Use recent_used_cpu to test p->cpus_ptr

When checking whether a recently used CPU can be a potential idle
candidate, recent_used_cpu should be used to test p->cpus_ptr, since
p->recent_used_cpu is not equal to recent_used_cpu and the candidate
decision is made based on recent_used_cpu here.

Fixes: 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu")
Signed-off-by: Miaohe Lin
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Phil Auld
Acked-by: Mel Gorman
Link: https://lore.kernel.org/r/20230620080747.359122-1-linmiaohe@huawei.com
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a80a73909dc2..b3e25be58e2b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7174,7 +7174,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
-	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 		return recent_used_cpu;
 	}
--
cgit v1.2.3


From aff037078ecaecf34a7c2afab1341815f90fba5e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Thu, 29 Jun 2023 17:56:12 -0700
Subject: sched/psi: use kernfs polling functions for PSI trigger polling

Destroying the psi trigger in cgroup_file_release() causes UAF issues
when a cgroup is removed from under a polling process. This happens
because cgroup removal causes a call to cgroup_file_release() while the
actual file is still alive. Destroying the trigger at this point would
also destroy its waitqueue head, and if there is still a polling
process on that file accessing the waitqueue, it will step on the freed
pointer:

  do_select
    vfs_poll
                 do_rmdir
                   cgroup_rmdir
                     kernfs_drain_open_files
                       cgroup_file_release
                         cgroup_pressure_release
                           psi_trigger_destroy
                             wake_up_pollfree(&t->event_wait)
                             // vfs_poll is unblocked
                             synchronize_rcu
                             kfree(t)
    poll_freewait -> UAF access to the trigger's waitqueue head

Patch [1] fixed this issue for the epoll() case using
wake_up_pollfree(), however the same issue exists for the synchronous
poll() case.

The root cause of this issue is that the lifecycles of the psi
trigger's waitqueue and of the file associated with the trigger are
different. Fix this by using the kernfs_generic_poll() function when
polling on cgroup-specific psi triggers. It internally uses the
kernfs_open_node->poll waitqueue head, whose lifecycle is tied to the
file's lifecycle. This also renders the fix in [1] obsolete, so revert
it.
[1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/
Reported-by: Lu Jialin
Signed-off-by: Suren Baghdasaryan
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20230630005612.1014540-1-surenb@google.com
---
 kernel/cgroup/cgroup.c |  2 +-
 kernel/sched/psi.c     | 29 +++++++++++++++++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index bfe3cd8ccf36..f55a40db065f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	psi = cgroup_psi(cgrp);
-	new = psi_trigger_create(psi, buf, res, of->file);
+	new = psi_trigger_create(psi, buf, res, of->file, of);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
 		return PTR_ERR(new);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 81fca77397f6..9bb3f2b3ccfc 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
 			continue;
 
 		/* Generate an event */
-		if (cmpxchg(&t->event, 0, 1) == 0)
-			wake_up_interruptible(&t->event_wait);
+		if (cmpxchg(&t->event, 0, 1) == 0) {
+			if (t->of)
+				kernfs_notify(t->of->kn);
+			else
+				wake_up_interruptible(&t->event_wait);
+		}
 		t->last_event_time = now;
 		/* Reset threshold breach flag once event got generated */
 		t->pending_event = false;
@@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	return 0;
 }
 
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res, struct file *file)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+				       enum psi_res res, struct file *file,
+				       struct kernfs_open_file *of)
 {
 	struct psi_trigger *t;
 	enum psi_states state;
@@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->event = 0;
 	t->last_event_time = 0;
-	init_waitqueue_head(&t->event_wait);
+	t->of = of;
+	if (!of)
+		init_waitqueue_head(&t->event_wait);
 	t->pending_event = false;
 	t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
@@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
 	 * being accessed later. Can happen if cgroup is deleted from under a
 	 * polling process.
 	 */
-	wake_up_pollfree(&t->event_wait);
+	if (t->of)
+		kernfs_notify(t->of->kn);
+	else
+		wake_up_interruptible(&t->event_wait);
 
 	if (t->aggregator == PSI_AVGS) {
 		mutex_lock(&group->avgs_lock);
@@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	if (!t)
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-	poll_wait(file, &t->event_wait, wait);
+	if (t->of)
+		kernfs_generic_poll(t->of, wait);
+	else
+		poll_wait(file, &t->event_wait, wait);
 
 	if (cmpxchg(&t->event, 1, 0) == 1)
 		ret |= EPOLLPRI;
@@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, res, file);
+	new = psi_trigger_create(&psi_system, buf, res, file, NULL);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);
--
cgit v1.2.3


From c9e4bf607d8c431452ef362c00e62ce999bbae93 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 11 Jul 2023 13:48:12 +0200
Subject: PM: hibernate: Fix writing maj:min to /sys/power/resume

resume_store() first calls lookup_bdev() and then tries to handle
maj:min, but it does not reset the error first, hence writing maj:min
fails with ENOENT:

  # echo 259:2 >| /sys/power/resume
  bash: echo: write error: No such file or directory

This should also fix hibernation via systemd, since it passes the
device in this form.

Fixes: 1e8c813b083c4 ("PM: hibernate: don't use early_lookup_bdev in resume_store")
Signed-off-by: Azat Khuzhin
Reviewed-by: Christoph Hellwig
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/hibernate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f62e89d0d906..e1b4bfa938dd 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1179,6 +1179,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
 		unsigned maj, min, offset;
 		char *p, dummy;
 
+		error = 0;
 		if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
 		    sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) {
--
cgit v1.2.3


From 3a8395b565b5b4f019b3dc182be4c4541eb35ac8 Mon Sep 17 00:00:00 2001
From: Chungkai Yang
Date: Wed, 5 Jul 2023 16:59:07 +0800
Subject: PM: QoS: Restore support for default value on frequency QoS

Commit 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is
non-negative") makes sure the CPU frequency is non-negative, to avoid
a negative value being converted to an unsigned data type. However,
when the value is PM_QOS_DEFAULT_VALUE, pm_qos_update_target()
specifically uses c->default_value, which is set to
FREQ_QOS_MIN/MAX_DEFAULT_VALUE when cpufreq_policy_alloc() is
executed, to handle this case.

Add a check for PM_QOS_DEFAULT_VALUE so that the default setting works
again, which fixes the problem.

Fixes: 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is non-negative")
Link: https://lore.kernel.org/lkml/20230626035144.19717-1-Chung-kai.Yang@mediatek.com/
Link: https://lore.kernel.org/lkml/20230627071727.16646-1-Chung-kai.Yang@mediatek.com/
Link: https://lore.kernel.org/lkml/CAJZ5v0gxNOWhC58PHeUhW_tgf6d1fGJVZ1x91zkDdht11yUv-A@mail.gmail.com/
Signed-off-by: Chungkai Yang
Cc: 6.0+ # 6.0+
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/qos.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index af51ed6d45ef..782d3b41c1f3 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -426,6 +426,11 @@ late_initcall(cpu_latency_qos_init);
 /* Definitions related to the frequency QoS below. */
 
+static inline bool freq_qos_value_invalid(s32 value)
+{
+	return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
 /**
  * freq_constraints_init - Initialize frequency QoS constraints.
  * @qos: Frequency QoS constraints to initialize.
@@ -531,7 +536,7 @@ int freq_qos_add_request(struct freq_constraints *qos,
 {
 	int ret;
 
-	if (IS_ERR_OR_NULL(qos) || !req || value < 0)
+	if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value))
 		return -EINVAL;
 
 	if (WARN(freq_qos_request_active(req),
@@ -563,7 +568,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request);
  */
 int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
 {
-	if (!req || new_value < 0)
+	if (!req || freq_qos_value_invalid(new_value))
 		return -EINVAL;
 
 	if (WARN(!freq_qos_request_active(req),
--
cgit v1.2.3


From 8cc32a9bbf2934d90762d9de0187adcb5ad46a11 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 28 Jun 2023 11:19:26 -0700
Subject: kallsyms: strip LTO-only suffixes from promoted global functions

Commit 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static
functions") stripped all function/variable suffixes that start with
'.', regardless of whether those suffixes are generated in LTO mode or
not. In fact, as far as I know, in LTO mode, when a static
function/variable is promoted to the global scope, a '.llvm.<...>'
suffix is added.

The existing mechanism breaks live patching for an LTO kernel even if
no .llvm.<...> symbols are involved. For example, for the following
kernel symbols:

  $ grep bpf_verifier_vlog /proc/kallsyms
  ffffffff81549f60 t bpf_verifier_vlog
  ffffffff8268b430 d bpf_verifier_vlog._entry
  ffffffff8282a958 d bpf_verifier_vlog._entry_ptr
  ffffffff82e12a1f d bpf_verifier_vlog.__already_done

'bpf_verifier_vlog' is a static function. '_entry', '_entry_ptr' and
'__already_done' are static variables used inside 'bpf_verifier_vlog',
so llvm promotes them to file-level static with the prefix
'bpf_verifier_vlog.'. Note that this func-level to file-level static
promotion also happens without LTO.

Given the symbol name 'bpf_verifier_vlog', with an LTO kernel the
current mechanism will return 4 symbols to the live patching
subsystem, which it cannot handle. With a non-LTO kernel, only one
symbol is returned.

In [1], we had a lengthy discussion; the suggestion was to separate
two cases: (1) new symbols with a suffix which are generated
regardless of whether LTO is enabled or not, and (2) new symbols with
a suffix generated only when LTO is enabled. cleanup_symbol_name()
should only remove suffixes for case (2), while case (1) should not be
changed, so it works uniformly with or without LTO.

This patch removes only the LTO-only suffix '.llvm.<...>', so live
patching and tracing work the same way as for a non-LTO kernel. The
cleanup_symbol_name() in scripts/kallsyms.c is also changed to use the
same filtering pattern, so both the kernel and the kallsyms tool have
the same expectation on the order of symbols.
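
For example (illustrative, not from the commit): after this change,
cleanup_symbol_name() still folds a hypothetical LTO-promoted symbol
'foo.llvm.9735a8db' (case (2)) into 'foo', while
'bpf_verifier_vlog._entry' is left untouched, since '._entry' is a
case (1) suffix that also appears without LTO, so a lookup of
'bpf_verifier_vlog' once again matches a single symbol.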
[1] https://lore.kernel.org/live-patching/20230615170048.2382735-1-song@kernel.org/T/#u

Fixes: 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static functions")
Reported-by: Song Liu
Signed-off-by: Yonghong Song
Reviewed-by: Zhen Lei
Reviewed-by: Nick Desaulniers
Acked-by: Song Liu
Link: https://lore.kernel.org/r/20230628181926.4102448-1-yhs@fb.com
Signed-off-by: Kees Cook
---
 kernel/kallsyms.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7982cc9d497c..016d997131d4 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -174,11 +174,10 @@ static bool cleanup_symbol_name(char *s)
 	 * LLVM appends various suffixes for local functions and variables that
 	 * must be promoted to global scope as part of LTO. This can break
 	 * hooking of static functions with kprobes. '.' is not a valid
-	 * character in an identifier in C. Suffixes observed:
+	 * character in an identifier in C. Suffixes only in LLVM LTO observed:
 	 * - foo.llvm.[0-9a-f]+
-	 * - foo.[0-9a-f]+
 	 */
-	res = strchr(s, '.');
+	res = strstr(s, ".llvm.");
 	if (res) {
 		*res = '\0';
 		return true;
--
cgit v1.2.3


From d5f28bb1ce04636b285726ee2a5afa3a514025f4 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Sat, 8 Jul 2023 01:38:03 +0900
Subject: fprobes: Add a comment why fprobe_kprobe_handler exits if kprobe is
 running

Add a comment explaining why fprobe_kprobe_handler() exits if any
other kprobe is running.

Link: https://lore.kernel.org/all/168874788299.159442.2485957441413653858.stgit@devnote2/

Suggested-by: Steven Rostedt
Link: https://lore.kernel.org/all/20230706120916.3c6abf15@gandalf.local.home/
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Steven Rostedt (Google)
---
 kernel/trace/fprobe.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 2571f7f3d5f2..59321d22f43e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -100,6 +100,12 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
 		return;
 	}
 
+	/*
+	 * This user handler is shared with other kprobes and is not expected to be
+	 * called recursively. So if any other kprobe handler is running, this will
+	 * exit as kprobe does. See the section 'Share the callbacks with kprobes'
+	 * in Documentation/trace/fprobe.rst for more information.
+	 */
 	if (unlikely(kprobe_running())) {
 		fp->nmissed++;
 		goto recursion_unlock;
--
cgit v1.2.3


From 66bcf65d6cf0ca6540e2341e88ee7ef02dbdda08 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Jul 2023 23:15:29 +0900
Subject: tracing/probes: Fix to avoid double count of the string length on
 the array

If an array is specified with the ustring or symstr types, the lengths
of the strings are accumulated in both 'ret' and 'total', which means
the length is double counted. Just set the length to the 'ret' value
to avoid double counting.
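
For example (hypothetical lengths, for illustration only): for a
2-entry ustring array whose strings are 5 and 7 bytes long, the old
'ret +=' accumulates ret = 5 and then ret = 5 + 7 = 12, while the
array loop adds each intermediate value of 'ret' to 'total', giving
5 + 12 = 17 instead of the correct 5 + 7 = 12.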
Link: https://lore.kernel.org/all/168908492917.123124.15076463491122036025.stgit@devnote2/

Reported-by: Dan Carpenter
Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/
Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Steven Rostedt (Google)
---
 kernel/trace/trace_probe_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 00707630788d..4735c5cb76fa 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -156,11 +156,11 @@ stage3:
 		code++;
 		goto array;
 	case FETCH_OP_ST_USTRING:
-		ret += fetch_store_strlen_user(val + code->offset);
+		ret = fetch_store_strlen_user(val + code->offset);
 		code++;
 		goto array;
 	case FETCH_OP_ST_SYMSTR:
-		ret += fetch_store_symstrlen(val + code->offset);
+		ret = fetch_store_symstrlen(val + code->offset);
 		code++;
 		goto array;
 	default:
--
cgit v1.2.3


From b41326b5e0f82e93592c4366359917b5d67b529f Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Jul 2023 23:15:38 +0900
Subject: tracing/probes: Fix not to count error code to total length

Fix not to count the error code (which is a negative value) in the
total used length of the array, because it can mess up the return code
of process_fetch_insn_bottom(). Also clear the 'ret' value, because it
will be used for calculating the next data_loc entry.

Link: https://lore.kernel.org/all/168908493827.123124.2175257289106364229.stgit@devnote2/

Reported-by: Dan Carpenter
Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/
Fixes: 9b960a38835f ("tracing: probeevent: Unify fetch_insn processing common part")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Steven Rostedt (Google)
---
 kernel/trace/trace_probe_tmpl.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 4735c5cb76fa..ed9d57c6b041 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -204,6 +204,8 @@ stage3:
 array:
 	/* the last stage: Loop on array */
 	if (code->op == FETCH_OP_LP_ARRAY) {
+		if (ret < 0)
+			ret = 0;
 		total += ret;
 		if (++i < code->param) {
 			code = s3;
--
cgit v1.2.3


From e38e2c6a9efc435f9de344b7c91f7697e01b47d5 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Jul 2023 23:15:48 +0900
Subject: tracing/probes: Fix to update dynamic data counter if fetcharg uses
 it

Fix to update the dynamic data counter ('dyndata') and the max length
('maxlen') only if the fetcharg uses the dynamic data. Also move
arg->dynamic out of unlikely(). Without this fix, the dynamic data
address ends up wrong if process_fetch_insn() returns an error in the
!arg->dynamic case.
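
For example (a hypothetical failure, for illustration only): if a
probe has a static (non-dynamic) argument whose fetch fails with
-EFAULT (-14), the old code falls through to the else branch and
advances 'dyndata' by -14 bytes, corrupting the data_loc offsets
computed for every following dynamic argument.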
Link: https://lore.kernel.org/all/168908494781.123124.8160245359962103684.stgit@devnote2/

Suggested-by: Steven Rostedt
Link: https://lore.kernel.org/all/20230710233400.5aaf024e@gandalf.local.home/
Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Steven Rostedt (Google)
---
 kernel/trace/trace_probe_tmpl.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index ed9d57c6b041..185da001f4c3 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -267,11 +267,13 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
 		if (unlikely(arg->dynamic))
 			*dl = make_data_loc(maxlen, dyndata - base);
 		ret = process_fetch_insn(arg->code, rec, dl, base);
-		if (unlikely(ret < 0 && arg->dynamic)) {
-			*dl = make_data_loc(0, dyndata - base);
-		} else {
-			dyndata += ret;
-			maxlen -= ret;
+		if (arg->dynamic) {
+			if (unlikely(ret < 0)) {
+				*dl = make_data_loc(0, dyndata - base);
+			} else {
+				dyndata += ret;
+				maxlen -= ret;
+			}
 		}
 	}
 }
--
cgit v1.2.3


From 4ed8f337dee32df71435689c19d22e4ee846e15a Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Jul 2023 23:15:57 +0900
Subject: Revert "tracing: Add "(fault)" name injection to kernel probes"

This reverts commit 2e9906f84fc7c99388bb7123ade167250d50f1c0.

It turned out that commit 2e9906f84fc7 ("tracing: Add "(fault)" name
injection to kernel probes") did not work correctly and probe events
still show just '(fault)' (instead of '"(fault)"'). Also, the current
'(fault)' is more explicit about the fact that the access faulted.

This also moves the FAULT_STRING macro to trace.h so that synthetic
events can keep using it, and uses it in trace_probe.c too.

Link: https://lore.kernel.org/all/168908495772.123124.1250788051922100079.stgit@devnote2/

Link: https://lore.kernel.org/all/20230706230642.3793a593@rorschach.local.home/
Cc: stable@vger.kernel.org
Cc: Andrew Morton
Cc: Tom Zanussi
Signed-off-by: Masami Hiramatsu (Google)
Reviewed-by: Steven Rostedt (Google)
---
 kernel/trace/trace.h              |  2 ++
 kernel/trace/trace_probe.c        |  2 +-
 kernel/trace/trace_probe_kernel.h | 31 ++++++-------------------------
 3 files changed, 9 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79bdefe9261b..eee1f3ca4749 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -113,6 +113,8 @@ enum trace_type {
 #define MEM_FAIL(condition, fmt, ...)					\
 	DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
 
+#define FAULT_STRING "(fault)"
+
 #define HIST_STACKTRACE_DEPTH	16
 #define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
 #define HIST_STACKTRACE_SKIP	5
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2d2616678295..591399ddcee5 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -65,7 +65,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent)
 	int len = *(u32 *)data >> 16;
 
 	if (!len)
-		trace_seq_puts(s, "(fault)");
+		trace_seq_puts(s, FAULT_STRING);
 	else
 		trace_seq_printf(s, "\"%s\"",
 				 (const char *)get_loc_data(data, ent));
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
index c4e1d4c03a85..6deae2ce34f8 100644
--- a/kernel/trace/trace_probe_kernel.h
+++ b/kernel/trace/trace_probe_kernel.h
@@ -2,8 +2,6 @@
 #ifndef __TRACE_PROBE_KERNEL_H_
 #define __TRACE_PROBE_KERNEL_H_
 
-#define FAULT_STRING "(fault)"
-
 /*
  * This depends on trace_probe.h, but can not include it due to
  * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
@@ -15,16 +13,8 @@ static nokprobe_inline int
 fetch_store_strlen_user(unsigned long addr)
 {
 	const void __user *uaddr =  (__force const void __user *)addr;
-	int ret;
 
-	ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
-	/*
-	 * strnlen_user_nofault returns zero on fault, insert the
-	 * FAULT_STRING when that occurs.
-	 */
-	if (ret <= 0)
-		return strlen(FAULT_STRING) + 1;
-	return ret;
+	return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
 }
 
 /* Return the length of string -- including null terminal byte */
@@ -44,18 +34,7 @@ fetch_store_strlen(unsigned long addr)
 		len++;
 	} while (c && ret == 0 && len < MAX_STRING_SIZE);
 
-	/* For faults, return enough to hold the FAULT_STRING */
-	return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
-}
-
-static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
-{
-	if (ret >= 0) {
-		*(u32 *)dest = make_data_loc(ret, __dest - base);
-	} else {
-		strscpy(__dest, FAULT_STRING, len);
-		ret = strlen(__dest) + 1;
-	}
+	return (ret < 0) ? ret : len;
 }
 
 /*
@@ -76,7 +55,8 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
 	__dest = get_loc_data(dest, base);
 
 	ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
-	set_data_loc(ret, dest, __dest, base, maxlen);
+	if (ret >= 0)
+		*(u32 *)dest = make_data_loc(ret, __dest - base);
 
 	return ret;
 }
@@ -107,7 +87,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
 	 * probing.
 	 */
 	ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
-	set_data_loc(ret, dest, __dest, base, maxlen);
+	if (ret >= 0)
+		*(u32 *)dest = make_data_loc(ret, __dest - base);
 
 	return ret;
 }
--
cgit v1.2.3


From 797311bce5c2ac90b8d65e357603cfd410d36ebb Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Jul 2023 23:16:07 +0900
Subject: tracing/probes: Fix to record 0-length data_loc in
 fetch_store_string*() if fails

Fix to record 0-length data in data_loc in fetch_store_string*() if it
fails to get the string data. Currently those functions expect the
data_loc to be updated by store_trace_args() if they return an error
code. However, that does not work correctly if the argument is an
array of strings. In that case, store_trace_args() only clears the
first entry of the array (which may have no error) and leaves the
other entries. So it should be cleared by fetch_store_string*()
itself.
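
For example (hypothetical, for illustration only): with a 3-entry
string array where only the second string faults,
store_trace_args() would reset just the first data_loc entry (whose
fetch did not even fail) and leave the second and third entries
pointing at stale offsets; clearing the data_loc inside
fetch_store_string*() handles each failing entry at the point of
failure.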
Also, 'dyndata' and 'maxlen' in store_trace_args() should be updated
only if they are actually used (ret > 0 and the argument is dynamic
data).

Link: https://lore.kernel.org/all/168908496683.123124.4761206188794205601.stgit@devnote2/

Fixes: 40b53b771806 ("tracing: probeevent: Add array type support")
Cc: stable@vger.kernel.org
Reviewed-by: Steven Rostedt (Google)
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_probe_kernel.h | 13 +++++++++----
 kernel/trace/trace_probe_tmpl.h   | 10 +++-------
 kernel/trace/trace_uprobe.c       |  3 ++-
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
index 6deae2ce34f8..bb723eefd7b7 100644
--- a/kernel/trace/trace_probe_kernel.h
+++ b/kernel/trace/trace_probe_kernel.h
@@ -37,6 +37,13 @@ fetch_store_strlen(unsigned long addr)
 	return (ret < 0) ? ret : len;
 }
 
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base)
+{
+	if (ret < 0)
+		ret = 0;
+	*(u32 *)dest = make_data_loc(ret, __dest - base);
+}
+
 /*
  * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
  * with max length and relative data location.
@@ -55,8 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
 	__dest = get_loc_data(dest, base);
 
 	ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
-	if (ret >= 0)
-		*(u32 *)dest = make_data_loc(ret, __dest - base);
+	set_data_loc(ret, dest, __dest, base);
 
 	return ret;
 }
@@ -87,8 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
 	 * probing.
 	 */
 	ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
-	if (ret >= 0)
-		*(u32 *)dest = make_data_loc(ret, __dest - base);
+	set_data_loc(ret, dest, __dest, base);
 
 	return ret;
 }
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 185da001f4c3..3935b347f874 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -267,13 +267,9 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
 		if (unlikely(arg->dynamic))
 			*dl = make_data_loc(maxlen, dyndata - base);
 		ret = process_fetch_insn(arg->code, rec, dl, base);
-		if (arg->dynamic) {
-			if (unlikely(ret < 0)) {
-				*dl = make_data_loc(0, dyndata - base);
-			} else {
-				dyndata += ret;
-				maxlen -= ret;
-			}
+		if (arg->dynamic && likely(ret > 0)) {
+			dyndata += ret;
+			maxlen -= ret;
 		}
 	}
 }
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8b92e34ff0c8..7b47e9a2c010 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -170,7 +170,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
 		 */
 		ret++;
 		*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
-	}
+	} else
+		*(u32 *)dest = make_data_loc(0, (void *)dst - base);
 
 	return ret;
 }
--
cgit v1.2.3


From 636e348353a7cc52609fdba5ff3270065da140d5 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Sun, 9 Jul 2023 01:33:44 +0200
Subject: prctl: move PR_GET_AUXV out of PR_MCE_KILL

Somehow PR_GET_AUXV got added into PR_MCE_KILL's switch when the patch
was applied [1]. Thus move it out of the switch, to the place the
patch intended to add it.

In the recently released v6.4 kernel some user could, in principle,
already be using this feature by mapping the right page and passing
the PR_GET_AUXV constant as a pointer:

  prctl(PR_MCE_KILL, PR_GET_AUXV, ...)

So this does change the behavior for users. We could keep the bug
since the other subcases in PR_MCE_KILL (PR_MCE_KILL_CLEAR and
PR_MCE_KILL_SET) do not overlap.
However, v6.4 may be recent enough (2 weeks old) that moving the lines
(rather than just adding a new case) does not break anybody? Moreover,
the documentation in man-pages was just committed today [2].

Link: https://lkml.kernel.org/r/20230708233344.361854-1-ojeda@kernel.org
Fixes: ddc65971bb67 ("prctl: add PR_GET_AUXV to copy auxv to userspace")
Link: https://lore.kernel.org/all/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org/ [1]
Link: https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/commit/?id=8cf0c06bfd3c2b219b044d4151c96f0da50af9ad [2]
Signed-off-by: Miguel Ojeda
Cc: Josh Triplett
Cc:
Signed-off-by: Andrew Morton
---
 kernel/sys.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 05f838929e72..2410e3999ebe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2535,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		else
 			return -EINVAL;
 		break;
-	case PR_GET_AUXV:
-		if (arg4 || arg5)
-			return -EINVAL;
-		error = prctl_get_auxv((void __user *)arg2, arg3);
-		break;
 	default:
 		return -EINVAL;
 	}
@@ -2694,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
+	case PR_GET_AUXV:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = prctl_get_auxv((void __user *)arg2, arg3);
+		break;
 #ifdef CONFIG_KSM
 	case PR_SET_MEMORY_MERGE:
 		if (arg3 || arg4 || arg5)
--
cgit v1.2.3


From ba7b3e7d5f9014be65879ede8fd599cb222901c9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Mon, 17 Jul 2023 21:45:28 +0530
Subject: bpf: Fix subprog idx logic in check_max_stack_depth

The assignment to idx in check_max_stack_depth happens once we see a
bpf_pseudo_call or bpf_pseudo_func. This is not an issue as the rest
of the code performs a few checks and then pushes the frame to the
frame stack, except in the case of async callbacks. If the async
callback case causes the loop iteration to be skipped, the idx
assignment will be incorrect on the next iteration of the loop: the
value stored in the frame stack (as the subprogno of the current
subprog) will be incorrect.

This leads to incorrect checks and incorrect tail_call_reachable
marking. Save the target subprog in a new variable and only assign to
idx once we are done with the is_async_cb check, which may skip
pushing the frame to the frame stack and the subsequent stack depth
checks and tail call markings.

Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20230717161530.1238-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 930b5555cfd3..e682056dd144 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5621,7 +5621,7 @@ process_func:
 continue_func:
 	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
-		int next_insn;
+		int next_insn, sidx;
 
 		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
@@ -5631,14 +5631,14 @@ continue_func:
 		/* find the callee */
 		next_insn = i + insn[i].imm + 1;
-		idx = find_subprog(env, next_insn);
-		if (idx < 0) {
+		sidx = find_subprog(env, next_insn);
+		if (sidx < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
 				  next_insn);
 			return -EFAULT;
 		}
-		if (subprog[idx].is_async_cb) {
-			if (subprog[idx].has_tail_call) {
+		if (subprog[sidx].is_async_cb) {
+			if (subprog[sidx].has_tail_call) {
 				verbose(env, "verifier bug. subprog has tail_call and async cb\n");
 				return -EFAULT;
 			}
@@ -5647,6 +5647,7 @@ continue_func:
 			continue;
 		}
 		i = next_insn;
+		idx = sidx;
 
 		if (subprog[idx].has_tail_call)
 			tail_call_reachable = true;
--
cgit v1.2.3


From b5e9ad522c4ccd32d322877515cff8d47ed731b9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Mon, 17 Jul 2023 21:45:29 +0530
Subject: bpf: Repeat check_max_stack_depth for async callbacks

While the check_max_stack_depth function explores call chains
emanating from the main prog, which is typically enough to cover all
possible call chains, it doesn't explore those rooted at async
callbacks unless the async callback is also directly called, since,
unlike for non-async callbacks, it skips their instruction exploration
as they don't contribute to the stack depth of the main prog.

It could be the case that an async callback leads to a call chain
which exceeds the stack depth, but this is never reachable while only
exploring the entry point from the main subprog. Hence, repeat the
check for the main subprog *and* all async callbacks marked by the
symbolic execution pass of the verifier, as execution of the program
may begin at any of them.

Consider functions with the following stack depths:

  main: 256
  async: 256
  foo: 256

  main:
      rX = async
      bpf_timer_set_callback(...)

  async:
      foo()

Here, async is not descended as it does not contribute to the stack
depth of main (since it is referenced using bpf_pseudo_func and not
bpf_pseudo_call). However, when async is invoked asynchronously, it
will end up breaching the MAX_BPF_STACK limit by calling foo.

Hence, in addition to main, we also need to explore call chains
beginning at all async callback subprogs in a program.

Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20230717161530.1238-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e682056dd144..02a021c524ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5573,16 +5573,17 @@ static int update_stack_depth(struct bpf_verifier_env *env,
  * Since recursion is prevented by check_cfg() this algorithm
  * only needs a local stack of MAX_CALL_FRAMES to remember callsites
  */
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 {
-	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
+	int depth = 0, frame = 0, i, subprog_end;
 	bool tail_call_reachable = false;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 	int j;
 
+	i = subprog[idx].start;
 process_func:
 	/* protect against potential stack overflow that might happen when
 	 * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5683,6 +5684,22 @@ continue_func:
 	goto continue_func;
 }
 
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *si = env->subprog_info;
+	int ret;
+
+	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (!i || si[i].is_async_cb) {
+			ret = check_max_stack_depth_subprog(env, i);
+			if (ret < 0)
+				return ret;
+		}
+		continue;
+	}
+	return 0;
+}
+
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 static int get_callee_stack_depth(struct bpf_verifier_env *env,
 				  const struct bpf_insn *insn, int idx)
--
cgit v1.2.3
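
A sketch of the pattern the repeated check now catches, mirroring the
main/async/foo example in the commit message above (a hypothetical
libbpf-style program; the map layout, names, sizes and the SEC("tc")
attach point are illustrative and not taken from the patch):

  // SPDX-License-Identifier: GPL-2.0
  /* Hypothetical reproducer sketch, for illustration only. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct elem {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } timers SEC(".maps");

  static __noinline int foo(void)
  {
          volatile char buf[256] = {};    /* foo: 256 bytes of stack */

          return buf[255];
  }

  static int async(void *map, int *key, struct bpf_timer *timer)
  {
          volatile char buf[256] = {};    /* async: another 256 bytes */

          /* async -> foo only runs asynchronously, so this chain is
           * invisible when exploration starts at main alone.
           */
          foo();
          return buf[255] ? 0 : 0;
  }

  SEC("tc")
  int main_prog(struct __sk_buff *ctx)
  {
          volatile char buf[256] = {};    /* main: 256 bytes of stack */
          int key = 0;
          struct elem *e = bpf_map_lookup_elem(&timers, &key);

          if (e) {
                  bpf_timer_init(&e->t, &timers, 0);
                  /* referenced via bpf_pseudo_func, not bpf_pseudo_call */
                  bpf_timer_set_callback(&e->t, async);
          }
          return buf[255];
  }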