From c98cc9797b7009308fff73d41bc1d08642dab77a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 10:58:20 -0400 Subject: ring-buffer: Move cpus_read_lock() outside of buffer->mutex Running a modified trace-cmd record --nosplice where it does a mmap of the ring buffer when '--nosplice' is set, caused the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.15.0-rc7-test-00002-gfb7d03d8a82f #551 Not tainted ------------------------------------------------------ trace-cmd/1113 is trying to acquire lock: ffff888100062888 (&buffer->mutex){+.+.}-{4:4}, at: ring_buffer_map+0x11c/0xe70 but task is already holding lock: ffff888100a5f9f8 (&cpu_buffer->mapping_lock){+.+.}-{4:4}, at: ring_buffer_map+0xcf/0xe70 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #5 (&cpu_buffer->mapping_lock){+.+.}-{4:4}: __mutex_lock+0x192/0x18c0 ring_buffer_map+0xcf/0xe70 tracing_buffers_mmap+0x1c4/0x3b0 __mmap_region+0xd8d/0x1f70 do_mmap+0x9d7/0x1010 vm_mmap_pgoff+0x20b/0x390 ksys_mmap_pgoff+0x2e9/0x440 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #4 (&mm->mmap_lock){++++}-{4:4}: __might_fault+0xa5/0x110 _copy_to_user+0x22/0x80 _perf_ioctl+0x61b/0x1b70 perf_ioctl+0x62/0x90 __x64_sys_ioctl+0x134/0x190 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #3 (&cpuctx_mutex){+.+.}-{4:4}: __mutex_lock+0x192/0x18c0 perf_event_init_cpu+0x325/0x7c0 perf_event_init+0x52a/0x5b0 start_kernel+0x263/0x3e0 x86_64_start_reservations+0x24/0x30 x86_64_start_kernel+0x95/0xa0 common_startup_64+0x13e/0x141 -> #2 (pmus_lock){+.+.}-{4:4}: __mutex_lock+0x192/0x18c0 perf_event_init_cpu+0xb7/0x7c0 cpuhp_invoke_callback+0x2c0/0x1030 __cpuhp_invoke_callback_range+0xbf/0x1f0 _cpu_up+0x2e7/0x690 cpu_up+0x117/0x170 cpuhp_bringup_mask+0xd5/0x120 bringup_nonboot_cpus+0x13d/0x170 smp_init+0x2b/0xf0 kernel_init_freeable+0x441/0x6d0 kernel_init+0x1e/0x160 ret_from_fork+0x34/0x70 ret_from_fork_asm+0x1a/0x30 -> #1 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x2a/0xd0 ring_buffer_resize+0x610/0x14e0 __tracing_resize_ring_buffer.part.0+0x42/0x120 tracing_set_tracer+0x7bd/0xa80 tracing_set_trace_write+0x132/0x1e0 vfs_write+0x21c/0xe80 ksys_write+0xf9/0x1c0 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (&buffer->mutex){+.+.}-{4:4}: __lock_acquire+0x1405/0x2210 lock_acquire+0x174/0x310 __mutex_lock+0x192/0x18c0 ring_buffer_map+0x11c/0xe70 tracing_buffers_mmap+0x1c4/0x3b0 __mmap_region+0xd8d/0x1f70 do_mmap+0x9d7/0x1010 vm_mmap_pgoff+0x20b/0x390 ksys_mmap_pgoff+0x2e9/0x440 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e other info that might help us debug this: Chain exists of: &buffer->mutex --> &mm->mmap_lock --> &cpu_buffer->mapping_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&cpu_buffer->mapping_lock); lock(&mm->mmap_lock); lock(&cpu_buffer->mapping_lock); lock(&buffer->mutex); *** DEADLOCK *** 2 locks held by trace-cmd/1113: #0: ffff888106b847e0 (&mm->mmap_lock){++++}-{4:4}, at: vm_mmap_pgoff+0x192/0x390 #1: ffff888100a5f9f8 (&cpu_buffer->mapping_lock){+.+.}-{4:4}, at: ring_buffer_map+0xcf/0xe70 stack backtrace: CPU: 5 UID: 0 PID: 1113 Comm: trace-cmd Not tainted 6.15.0-rc7-test-00002-gfb7d03d8a82f #551 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl+0x6e/0xa0 print_circular_bug.cold+0x178/0x1be check_noncircular+0x146/0x160 __lock_acquire+0x1405/0x2210 lock_acquire+0x174/0x310 ? ring_buffer_map+0x11c/0xe70 ? ring_buffer_map+0x11c/0xe70 ? __mutex_lock+0x169/0x18c0 __mutex_lock+0x192/0x18c0 ? ring_buffer_map+0x11c/0xe70 ? ring_buffer_map+0x11c/0xe70 ? function_trace_call+0x296/0x370 ? __pfx___mutex_lock+0x10/0x10 ? __pfx_function_trace_call+0x10/0x10 ? __pfx___mutex_lock+0x10/0x10 ? _raw_spin_unlock+0x2d/0x50 ? ring_buffer_map+0x11c/0xe70 ? ring_buffer_map+0x11c/0xe70 ? __mutex_lock+0x5/0x18c0 ring_buffer_map+0x11c/0xe70 ? do_raw_spin_lock+0x12d/0x270 ? find_held_lock+0x2b/0x80 ? _raw_spin_unlock+0x2d/0x50 ? rcu_is_watching+0x15/0xb0 ? _raw_spin_unlock+0x2d/0x50 ? trace_preempt_on+0xd0/0x110 tracing_buffers_mmap+0x1c4/0x3b0 __mmap_region+0xd8d/0x1f70 ? ring_buffer_lock_reserve+0x99/0xff0 ? __pfx___mmap_region+0x10/0x10 ? ring_buffer_lock_reserve+0x99/0xff0 ? __pfx_ring_buffer_lock_reserve+0x10/0x10 ? __pfx_ring_buffer_lock_reserve+0x10/0x10 ? bpf_lsm_mmap_addr+0x4/0x10 ? security_mmap_addr+0x46/0xd0 ? lock_is_held_type+0xd9/0x130 do_mmap+0x9d7/0x1010 ? 0xffffffffc0370095 ? __pfx_do_mmap+0x10/0x10 vm_mmap_pgoff+0x20b/0x390 ? __pfx_vm_mmap_pgoff+0x10/0x10 ? 0xffffffffc0370095 ksys_mmap_pgoff+0x2e9/0x440 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fb0963a7de2 Code: 00 00 00 0f 1f 44 00 00 41 f7 c1 ff 0f 00 00 75 27 55 89 cd 53 48 89 fb 48 85 ff 74 3b 41 89 ea 48 89 df b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 5b 5d c3 0f 1f 00 48 8b 05 e1 9f 0d 00 64 RSP: 002b:00007ffdcc8fb878 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb0963a7de2 RDX: 0000000000000001 RSI: 0000000000001000 RDI: 0000000000000000 RBP: 0000000000000001 R08: 0000000000000006 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffdcc8fbe68 R14: 00007fb096628000 R15: 00005633e01a5c90 The issue is that cpus_read_lock() is taken within buffer->mutex. The memory mapped pages are taken with the mmap_lock held. The buffer->mutex is taken within the cpu_buffer->mapping_lock. There's quite a chain with all these locks, where the deadlock can be fixed by moving the cpus_read_lock() outside the taking of the buffer->mutex. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250527105820.0f45d045@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f9bf562beea..ca1a8e706004 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2849,6 +2849,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2893,7 +2899,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2933,7 +2938,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2961,8 +2965,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2981,7 +2983,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: -- cgit v1.2.3 From 4fc78a7c9ca994e1da5d3940704d4e8f0ea8c5e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 28 May 2025 12:15:55 -0400 Subject: ring-buffer: Do not trigger WARN_ON() due to a commit_overrun When reading a memory mapped buffer the reader page is just swapped out with the last page written in the write buffer. If the reader page is the same as the commit buffer (the buffer that is currently being written to) it was assumed that it should never have missed events. If it does, it triggers a WARN_ON_ONCE(). But there just happens to be one scenario where this can legitimately happen. That is on a commit_overrun. A commit overrun is when an interrupt preempts an event being written to the buffer and then the interrupt adds so many new events that it fills and wraps the buffer back to the commit. Any new events would then be dropped and be reported as "missed_events". In this case, the next page to read is the commit buffer and after the swap of the reader page, the reader page will be the commit buffer, but this time there will be missed events and this triggers the following warning: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1127 at kernel/trace/ring_buffer.c:7357 ring_buffer_map_get_reader+0x49a/0x780 Modules linked in: kvm_intel kvm irqbypass CPU: 2 UID: 0 PID: 1127 Comm: trace-cmd Not tainted 6.15.0-rc7-test-00004-g478bc2824b45-dirty #564 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:ring_buffer_map_get_reader+0x49a/0x780 Code: 00 00 00 48 89 fe 48 c1 ee 03 80 3c 2e 00 0f 85 ec 01 00 00 4d 3b a6 a8 00 00 00 0f 85 8a fd ff ff 48 85 c0 0f 84 55 fe ff ff <0f> 0b e9 4e fe ff ff be 08 00 00 00 4c 89 54 24 58 48 89 54 24 50 RSP: 0018:ffff888121787dc0 EFLAGS: 00010002 RAX: 00000000000006a2 RBX: ffff888100062800 RCX: ffffffff8190cb49 RDX: ffff888126934c00 RSI: 1ffff11020200a15 RDI: ffff8881010050a8 RBP: dffffc0000000000 R08: 0000000000000000 R09: ffffed1024d26982 R10: ffff888126934c17 R11: ffff8881010050a8 R12: ffff888126934c00 R13: ffff8881010050b8 R14: ffff888101005000 R15: ffff888126930008 FS: 00007f95c8cd7540(0000) GS:ffff8882b576e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f95c8de4dc0 CR3: 0000000128452002 CR4: 0000000000172ef0 Call Trace: ? __pfx_ring_buffer_map_get_reader+0x10/0x10 tracing_buffers_ioctl+0x283/0x370 __x64_sys_ioctl+0x134/0x190 do_syscall_64+0x79/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f95c8de48db Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00 RSP: 002b:00007ffe037ba110 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe037bb2b0 RCX: 00007f95c8de48db RDX: 0000000000000000 RSI: 0000000000005220 RDI: 0000000000000006 RBP: 00007ffe037ba180 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe037bb6f8 R14: 00007f95c9065000 R15: 00005575c7492c90 irq event stamp: 5080 hardirqs last enabled at (5079): [] _raw_spin_unlock_irqrestore+0x50/0x70 hardirqs last disabled at (5080): [] _raw_spin_lock_irqsave+0x63/0x70 softirqs last enabled at (4182): [] handle_softirqs+0x552/0x710 softirqs last disabled at (4159): [] __irq_exit_rcu+0x107/0x210 ---[ end trace 0000000000000000 ]--- The above was triggered by running on a kernel with both lockdep and KASAN as well as kmemleak enabled and executing the following command: # perf record -o perf-test.dat -a -- trace-cmd record --nosplice -e all -p function hackbench 50 With perf interjecting a lot of interrupts and trace-cmd enabling all events as well as function tracing, with lockdep, KASAN and kmemleak enabled, it could cause an interrupt preempting an event being written to add enough events to wrap the buffer. trace-cmd was modified to have --nosplice use mmap instead of reading the buffer. The way to differentiate this case from the normal case of there only being one page written to where the swap of the reader page received that one page (which is the commit page), check if the tail page is on the reader page. The difference between the commit page and the tail page is that the tail page is where new writes go to, and the commit page holds the first write that hasn't been committed yet. In the case of an interrupt preempting the write of an event and filling the buffer, it would move the tail page but not the commit page. Have the warning only trigger if the tail page is also on the reader page, and also print out the number of events dropped by a commit overrun as that can not yet be safely added to the page so that the reader can see there were events dropped. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250528121555.2066527e@gandalf.local.home Fixes: fe832be05a8ee ("ring-buffer: Have mmapped ring buffer keep track of missed events") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ca1a8e706004..683aa57870fe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7285,8 +7285,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7307,13 +7307,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; -- cgit v1.2.3 From c2a08311427cc8c5c547e5d700cb2f93d63fcb2a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 1 Apr 2025 16:25:54 -0400 Subject: ring-buffer: Allow reserve_mem persistent ring buffers to be mmapped When the persistent ring buffer is created from the memory returned by reserve_mem there is nothing prohibiting it to be memory mapped to user space. The memory is the same as the pages allocated by alloc_page(). The way the memory is managed by the ring buffer code is slightly different though and needs to be addressed. The persistent memory uses the page->id for its own purpose where as the user mmap buffer currently uses that for the subbuf array mapped to user space. If the buffer is a persistent buffer, use the page index into that buffer as the identifier instead of the page->id. That is, the page->id for a persistent buffer, represents the order of the buffer is in the link list. ->id == 0 means it is the reader page. When a reader page is swapped, the new reader page's ->id gets zero, and the old reader page gets the ->id of the page that it swapped with. The user space mapping has the ->id is the index of where it was mapped in user space and does not change while it is mapped. Since the persistent buffer is fixed in its location, the index of where a page is in the memory range can be used as the "id" to put in the meta page array, and it can be mapped in the same order to user space as it is in the persistent memory. A new rb_page_id() helper function is used to get and set the id depending on if the page is a normal memory allocated buffer or a physical memory mapped buffer. Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Jann Horn Link: https://lore.kernel.org/20250401203332.246646011@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 683aa57870fe..e40f5c6d7908 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6003,6 +6003,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ording in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -6011,7 +6044,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); @@ -6927,23 +6962,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); + WARN_ON(cnt != nr_subbufs); + /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; -- cgit v1.2.3 From 2d22216521b12c4b09479aa684504092cd67970b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 15:51:16 -0400 Subject: ring-buffer: Removed unnecessary if() goto out where out is the next line In the function ring_buffer_discard_commit() there's an if statement that jumps to the next line: if (rb_try_to_discard(cpu_buffer, event)) goto out; out: This was caused by the change that modified the way timestamps were taken in interrupt context, and removed the code between the if statement and the goto, but failed to update the conditional logic. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250527155116.227f35be@gandalf.local.home Fixes: a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e40f5c6d7908..a01fc42f2a13 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4685,10 +4685,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); -- cgit v1.2.3 From f115d2b70bff2665f67fa8e8dc5ed9452b696c44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 14:57:53 -0400 Subject: ring-buffer: Remove jump to out label in ring_buffer_swap_cpu() The function ring_buffer_swap_cpu() has a bunch of jumps to the label out that simply returns "ret". There's no reason to jump to a label that simply returns a value. Just return directly from there. This goes back to almost the beginning when commit 8aabee573dff ("ring-buffer: remove unneeded get_online_cpus") was introduced. That commit removed a put_online_cpus() from that label, but never updated all the jumps to it that now no longer needed to do anything but return a value. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250527145753.6b45d840@gandalf.local.home Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a01fc42f2a13..c912ce4c8c89 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6315,37 +6315,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6382,7 +6378,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -- cgit v1.2.3 From f0d8cbc8ccc6299046ceb5023a2af9e2b75c4106 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 14:46:23 -0400 Subject: ring-buffer: Simplify reset_disabled_cpu_buffer() with use of guard() Use guard(raw_spinlock_irqsave)() in reset_disabled_cpu_buffer() to simplify the locking. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250527144623.77a9cc47@gandalf.local.home Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c912ce4c8c89..58cca10f482b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6113,21 +6113,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** -- cgit v1.2.3 From b2e7c6ed26e90fab1e5e626071e54e3b9ec9cb5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 14:52:16 -0400 Subject: ring-buffer: Simplify ring_buffer_read_page() with guard() The function ring_buffer_read_page() had two gotos. One was simply returning "ret" and the other was unlocking the reader_lock. There's no reason to use goto to simply return the "ret" variable. Instead just return the value. The jump to the unlocking of the reader_lock can be replaced by guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock). With these two changes the "ret" variable is no longer used and can be removed. The return value on non-error is what was read and is stored in the "read" variable. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250527145216.0187cf36@gandalf.local.home Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58cca10f482b..931bbcc6640f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6531,38 +6531,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6596,7 +6595,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6605,7 +6604,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6663,7 +6662,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6690,11 +6688,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -- cgit v1.2.3 From 60bc720e10eac397b3adae975095df77bc368b88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 12:20:09 -0400 Subject: ring-buffer: Make ring_buffer_{un}map() simpler with guard(mutex) Convert the taking of the buffer->mutex and the cpu_buffer->mapping_lock over to guard(mutex) and simplify the ring_buffer_map() and ring_buffer_unmap() functions. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250527122009.267efb72@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 931bbcc6640f..ef1cca8125df 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7161,36 +7161,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; - int err = 0; + int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7218,35 +7216,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7261,12 +7253,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) -- cgit v1.2.3 From 99d232804405e35d7a9af1536a057578a1442e81 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2025 14:31:44 -0400 Subject: ring-buffer: Simplify functions with __free(kfree) to free allocations The function rb_allocate_pages() allocates cpu_buffer and on error needs to free it. It has a single return. Use __free(kfree) and return directly on errors and have the return use return_ptr(cpu_buffer). The function alloc_buffer() allocates buffer and on error needs to free it. It has a single return. Use __free(kfree) and return directly on errors and have the return use return_ptr(buffer). The function __rb_map_vma() allocates a temporary array "pages". Have it use __free() and not worry about freeing it when returning. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250527143144.6edc4625@gandalf.local.home Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ef1cca8125df..295b6fbfa81a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2226,7 +2226,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL; struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; @@ -2252,7 +2252,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2318,13 +2318,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2359,7 +2357,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, unsigned long scratch_size, struct lock_class_key *key) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2373,7 +2371,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2472,7 +2470,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2484,8 +2482,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -7057,7 +7053,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7125,10 +7121,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); @@ -7143,9 +7137,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else -- cgit v1.2.3