From 912da2c384d510ce40c5af9c3adc316afa4ec547 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 8 Oct 2024 14:32:42 -0400
Subject: ring-buffer: Do not have boot mapped buffers hook to CPU hotplug

The boot mapped ring buffer has its buffer mapped at a fixed location found at boot up. It is not dynamic. It cannot grow or be expanded when new CPUs come online.

Do not hook fixed memory mapped ring buffers to the CPU hotplug callback, otherwise it can cause a crash when it tries to add the buffer to the memory that is already fully occupied.

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241008143242.25e20801@gandalf.local.home
Fixes: be68d63a139bd ("ring-buffer: Add ring_buffer_alloc_range()")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 77dc0b25140e..fb04445f92c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2337,9 +2337,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 if (!buffer->buffers[cpu])
 goto fail_free_buffers;

- ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
- if (ret < 0)
- goto fail_free_buffers;
+ /* If already mapped, do not hook to CPU hotplug */
+ if (!start) {
+ ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ if (ret < 0)
+ goto fail_free_buffers;
+ }

 mutex_init(&buffer->mutex);
-- cgit v1.2.3

From ad6b5b6ea9b764018249285a4fe0a2226bef4caa Mon Sep 17 00:00:00 2001
From: Tyrone Wu
Date: Fri, 11 Oct 2024 00:08:02 +0000
Subject: bpf: Fix unpopulated path_size when uprobe_multi fields unset

Previously when retrieving `bpf_link_info.uprobe_multi` with `path` and `path_size` fields unset, the `path_size` field is not populated (remains 0). This behavior was inconsistent with how other input/output string buffer fields work, as the field should be populated in cases when:

- both buffer and length are set (currently works as expected)
- both buffer and length are unset (not working as expected)

This patch now fills the `path_size` field when `path` and `path_size` are unset.

Fixes: e56fdbfb06e2 ("bpf: Add link_info support for uprobe multi link")
Signed-off-by: Tyrone Wu
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20241011000803.681190-1-wudevelops@gmail.com
---
 kernel/trace/bpf_trace.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a582cd25ca87..3bd402fa62a4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3133,7 +3133,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
 struct bpf_uprobe_multi_link *umulti_link;
 u32 ucount = info->uprobe_multi.count;
 int err = 0, i;
- long left;
+ char *p, *buf;
+ long left = 0;

 if (!upath ^ !upath_size)
 return -EINVAL;
@@ -3147,26 +3148,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
 info->uprobe_multi.pid = umulti_link->task ?
 task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;

- if (upath) {
- char *p, *buf;
-
- upath_size = min_t(u32, upath_size, PATH_MAX);
-
- buf = kmalloc(upath_size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- p = d_path(&umulti_link->path, buf, upath_size);
- if (IS_ERR(p)) {
- kfree(buf);
- return PTR_ERR(p);
- }
- upath_size = buf + upath_size - p;
- left = copy_to_user(upath, p, upath_size);
+ upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX;
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
 kfree(buf);
- if (left)
- return -EFAULT;
- info->uprobe_multi.path_size = upath_size;
+ return PTR_ERR(p);
 }
+ upath_size = buf + upath_size - p;
+
+ if (upath)
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;

 if (!uoffsets && !ucookies && !uref_ctr_offsets)
 return 0;
-- cgit v1.2.3

From cd9626e9ebc77edec33023fe95dab4b04ffc819d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 10 Oct 2024 11:38:10 +0200
Subject: sched/fair: Fix external p->on_rq users

Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement delayed dequeue") KVM's preemption notifiers have started mis-classifying preemption vs blocking.

Notably p->on_rq is no longer sufficient to determine if a task is runnable or blocked -- the aforementioned commit introduces tasks that remain on the runqueue even though they will not run again, and should be considered blocked in many cases.

Add the task_is_runnable() helper to classify things and audit all external users of the p->on_rq state. Also add a few comments.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Reported-by: Sean Christopherson
Tested-by: Sean Christopherson
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd7e778..1469dd8075fa 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 /* reset the max latency */
 tr->max_latency = 0;

- while (p->on_rq) {
+ while (task_is_runnable(p)) {
 /*
 * Sleep to make sure the -deadline thread is asleep too.
 * On virtual machines we can't rely on timings,
-- cgit v1.2.3

From 2cf9733891a460a16a209fcc20fbd138605b13b8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 11 Oct 2024 16:52:24 -0400
Subject: ring-buffer: Fix refcount setting of boot mapped buffers

A ring buffer which has its buffer mapped at boot up to fixed memory should not be freed. Other buffers can be.

The ref counting setup was wrong for both. It left the non-mapped buffers with a ref count of zero and the boot mapped buffer with a ref count of 1, but a normally allocated buffer should be 1, where it can be removed.

Keep a normally allocated boot buffer at its setup ref count (do not decrement it), and increment the ref count of the fixed memory boot mapped buffer.
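The intended policy can be modeled with a small stand-alone C sketch; the names below are hypothetical and not the kernel's trace_array API. Every instance starts with one setup reference; a fixed-memory boot mapped instance takes one extra reference so a later removal can never free it, while a normally allocated instance keeps only its setup reference and stays removable.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical model of the ref count policy described above. */
struct instance {
        int ref;
        bool boot_mapped;
};

static struct instance *instance_create(bool boot_mapped)
{
        struct instance *inst = calloc(1, sizeof(*inst));

        if (!inst)
                return NULL;
        inst->ref = 1;                  /* setup reference */
        inst->boot_mapped = boot_mapped;
        if (boot_mapped)
                inst->ref++;            /* pin: must never drop to zero */
        return inst;
}

static void instance_put(struct instance *inst)
{
        if (--inst->ref == 0) {
                printf("instance freed\n");
                free(inst);
        } else {
                printf("instance still pinned (ref=%d)\n", inst->ref);
        }
}

int main(void)
{
        struct instance *normal = instance_create(false);
        struct instance *pinned = instance_create(true);

        if (!normal || !pinned)
                return 1;
        instance_put(normal);   /* 1 -> 0: freed, like a removable buffer */
        instance_put(pinned);   /* 2 -> 1: survives, like a boot mapped buffer */
        return 0;
}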
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241011165224.33dd2624@gandalf.local.home
Fixes: e645535a954ad ("tracing: Add option to use memmapped memory for trace boot instance")
Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1c69ca1f1088..a8f52b6527ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10621,10 +10621,10 @@ __init static void enable_instances(void)
 * cannot be deleted by user space, so keep the reference
 * to it.
 */
- if (start)
+ if (start) {
 tr->flags |= TRACE_ARRAY_FL_BOOT;
- else
- trace_array_put(tr);
+ tr->ref++;
+ }

 while ((tok = strsep(&curr_str, ","))) {
 early_enable_events(tr, tok, true);
-- cgit v1.2.3

From 09661f75e75cb6c1d2d8326a70c311d46729235f Mon Sep 17 00:00:00 2001
From: Petr Pavlu
Date: Tue, 15 Oct 2024 13:24:29 +0200
Subject: ring-buffer: Fix reader locking when changing the sub buffer order

The function ring_buffer_subbuf_order_set() updates each ring_buffer_per_cpu and installs new sub buffers that match the requested page order. This operation may be invoked concurrently with readers that rely on some of the modified data, such as the head bit (RB_PAGE_HEAD), or the ring_buffer_per_cpu.pages and reader_page pointers. However, no exclusive access is acquired by ring_buffer_subbuf_order_set(). Modifying the mentioned data while a reader also operates on them can then result in incorrect memory access and various crashes.

Fix the problem by taking the reader_lock when updating a specific ring_buffer_per_cpu in ring_buffer_subbuf_order_set().

Link: https://lore.kernel.org/linux-trace-kernel/20240715145141.5528-1-petr.pavlu@suse.com/
Link: https://lore.kernel.org/linux-trace-kernel/20241010195849.2f77cc3f@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20241011112850.17212b25@gandalf.local.home/
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241015112440.26987-1-petr.pavlu@suse.com
Fixes: 8e7b58c27b3c ("ring-buffer: Just update the subbuffers when changing their allocation order")
Signed-off-by: Petr Pavlu
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 18 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fb04445f92c3..3ea4f7bb1837 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6728,39 +6728,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 }

 for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_data_page *old_free_data_page;
+ struct list_head old_pages;
+ unsigned long flags;

 if (!cpumask_test_cpu(cpu, buffer->cpumask))
 continue;

 cpu_buffer = buffer->buffers[cpu];

+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
 /* Clear the head bit to make the link list normal to read */
 rb_head_page_deactivate(cpu_buffer);

- /* Now walk the list and free all the old sub buffers */
- list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- /* The above loop stopped an the last page needing to be freed */
- bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
- free_buffer_page(bpage);
-
- /* Free the current reader page */
- free_buffer_page(cpu_buffer->reader_page);
+ /*
+ * Collect buffers from the cpu_buffer pages list and the
+ * reader_page on old_pages, so they can be freed later when not
+ * under a spinlock. The pages list is a linked list with no
+ * head, adding old_pages turns it into a regular list with
+ * old_pages being the head.
+ */
+ list_add(&old_pages, cpu_buffer->pages);
+ list_add(&cpu_buffer->reader_page->list, &old_pages);

 /* One page was allocated for the reader page */
 cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list);
 list_del_init(&cpu_buffer->reader_page->list);

- /* The cpu_buffer pages are a link list with no head */
+ /* Install the new pages, remove the head from the list */
 cpu_buffer->pages = cpu_buffer->new_pages.next;
- cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
- cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
-
- /* Clear the new_pages list */
- INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ list_del_init(&cpu_buffer->new_pages);

 cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list);
@@ -6769,11 +6768,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
 cpu_buffer->nr_pages_to_update = 0;

- free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ old_free_data_page = cpu_buffer->free_page;
 cpu_buffer->free_page = NULL;

 rb_head_page_activate(cpu_buffer);

+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ /* Free old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, &old_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ free_pages((unsigned long)old_free_data_page, old_order);
+
 rb_check_pages(cpu_buffer);
 }
-- cgit v1.2.3

From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks

The function graph infrastructure allocates a shadow stack for every task when enabled. This includes the idle tasks. The first time the function graph is invoked, the shadow stacks are created and never freed until the task exits. This includes the idle tasks.

Only the idle tasks that were for online CPUs had their shadow stacks created when function graph tracing started. If function graph tracing is enabled and a CPU comes online, the idle task representing that CPU will not have its shadow stack created, and all function graph tracing for that idle task will be silently dropped.

Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks. This will include idle tasks for CPUs that come online during tracing.

This issue can be reproduced by:

 # cd /sys/kernel/tracing
 # echo 0 > /sys/devices/system/cpu/cpu1/online
 # echo 0 > set_ftrace_pid
 # echo function_graph > current_tracer
 # echo 1 > options/funcgraph-proc
 # echo 1 > /sys/devices/system/cpu/cpu1/online
 # grep '' per_cpu/cpu1/trace | head

Before, nothing would show up. After:

 1) <idle>-0 | 0.811 us | __enqueue_entity();
 1) <idle>-0 | 5.626 us | } /* enqueue_entity */
 1) <idle>-0 | | dl_server_update_idle_time() {
 1) <idle>-0 | | dl_scaled_delta_exec() {
 1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
 1) <idle>-0 | 1.242 us | }
 1) <idle>-0 | 1.908 us | }
 1) <idle>-0 | | dl_server_start() {
 1) <idle>-0 | | enqueue_dl_entity() {
 1) <idle>-0 | | task_contending() {

Note, if tracing stops and restarts, the old way would then initialize the onlined CPUs.
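For background on why this also covers CPUs that are already up: cpuhp_setup_state() with a dynamic CPUHP_AP_ONLINE_DYN state invokes the startup callback on every CPU that is online at registration time, and then again for each CPU that is onlined later. A minimal, hypothetical module sketch of that pattern follows; it is not the fgraph code itself (the patch passes a NULL teardown callback and never removes the state), and the demo_* names are made up.

#include <linux/cpuhotplug.h>
#include <linux/module.h>

/* Hypothetical example module, not fgraph. The callback runs once for every
 * CPU already online at registration and for each CPU onlined afterwards. */
static int demo_cpu_online(unsigned int cpu)
{
        pr_info("demo: cpu %u is online\n", cpu);
        return 0;
}

static enum cpuhp_state demo_state;

static int __init demo_init(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
                                demo_cpu_online, NULL);
        if (ret < 0)
                return ret;
        demo_state = ret;       /* dynamic states return the allocated state */
        return 0;
}

static void __exit demo_exit(void)
{
        cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");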
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Mark Rutland
Cc: Thomas Gleixner
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
 static int start_graph_tracing(void)
 {
 unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;

 ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
 if (!ret_stack_list)
 return -ENOMEM;

- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
 do {
 ret = alloc_retstack_tasklist(ret_stack_list);
 } while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
 fgraph_direct_gops = &fgraph_stub;
 }

+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
+ static bool fgraph_initialized;
 int command = 0;
 int ret = 0;
 int i = -1;

 mutex_lock(&ftrace_lock);

+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
 if (!fgraph_array[0]) {
 /* The array must always have real data on it */
 for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
-- cgit v1.2.3

From fae4078c289a2f24229c0de652249948b1cd6bdb Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:52:12 -0400
Subject: fgraph: Allocate ret_stack_list with proper size

The ret_stack_list is an array of ret_stack shadow stacks for the function graph usage. When the first function graph is enabled, all tasks in the system get a shadow stack. The ret_stack_list is a 32 element array of pointers to these shadow stacks. It allocates the shadow stack in batches (32 stacks at a time), assigns them to running tasks, and continues until all tasks are covered.

When the function graph shadow stack changed from an array of ftrace_ret_stack structures to an array of longs, the allocation of ret_stack_list went from allocating an array of 32 elements to just a block defined by SHADOW_STACK_SIZE. Luckily, that's defined as PAGE_SIZE and is much more than enough to hold 32 pointers. But it is way overkill for the amount needed to allocate.

Change the allocation of ret_stack_list back to a kcalloc() of FTRACE_RETSTACK_ALLOC_SIZE pointers.
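The size difference is easy to check with back-of-the-envelope arithmetic: assuming 64-bit pointers and 4 KiB pages, and taking the values quoted in the message above (FTRACE_RETSTACK_ALLOC_SIZE is 32 and SHADOW_STACK_SIZE equals PAGE_SIZE), the old allocation asked for 4096 bytes where 32 pointers need only 256. A throwaway user-space sketch of that arithmetic, with the constants mirrored from the commit message rather than the kernel headers:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;   /* SHADOW_STACK_SIZE == PAGE_SIZE, per the message */
        const unsigned long nr_stacks = 32;     /* FTRACE_RETSTACK_ALLOC_SIZE */

        printf("old: kmalloc(SHADOW_STACK_SIZE)   -> %lu bytes\n", page_size);
        printf("new: kcalloc(32, sizeof(void *))  -> %lu bytes\n",
               (unsigned long)(nr_stacks * sizeof(void *)));
        return 0;
}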
Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241018215212.23f13f40@rorschach
Fixes: 42675b723b484 ("function_graph: Convert ret_stack to a series of longs")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 43f4e3f57438..41e7a15dcb50 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1162,7 +1162,8 @@ static int start_graph_tracing(void)
 unsigned long **ret_stack_list;
 int ret;

- ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(*ret_stack_list), GFP_KERNEL);
 if (!ret_stack_list)
 return -ENOMEM;
-- cgit v1.2.3
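One general takeaway from the reader-locking fix to ring_buffer_subbuf_order_set() earlier in this series: freeing is moved out from under the raw spinlock by first collecting the doomed objects on a local list. A simplified, hypothetical kernel-style sketch of that pattern follows; it uses an ordinary headed list, whereas the ring buffer's pages list is headless, which is why the patch instead splices it by list_add() of the local head.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical example; not the ring buffer code. */
struct item {
        struct list_head list;
        /* payload ... */
};

/*
 * Detach everything from @active while holding @lock, then free the
 * items only after the lock is dropped so no slow work runs under it.
 */
static void retire_all_items(struct list_head *active, raw_spinlock_t *lock)
{
        struct item *item, *tmp;
        unsigned long flags;
        LIST_HEAD(old_items);

        raw_spin_lock_irqsave(lock, flags);
        list_splice_init(active, &old_items);
        raw_spin_unlock_irqrestore(lock, flags);

        /* Safe: nothing else can reach old_items now. */
        list_for_each_entry_safe(item, tmp, &old_items, list) {
                list_del_init(&item->list);
                kfree(item);    /* items are assumed to be kmalloc()ed */
        }
}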