From c98cc9797b7009308fff73d41bc1d08642dab77a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 10:58:20 -0400
Subject: ring-buffer: Move cpus_read_lock() outside of buffer->mutex

Running a modified trace-cmd record --nosplice where it does a mmap of the
ring buffer when '--nosplice' is set, caused the following lockdep splat:

 ======================================================
 WARNING: possible circular locking dependency detected
 6.15.0-rc7-test-00002-gfb7d03d8a82f #551 Not tainted
 ------------------------------------------------------
 trace-cmd/1113 is trying to acquire lock:
 ffff888100062888 (&buffer->mutex){+.+.}-{4:4}, at: ring_buffer_map+0x11c/0xe70

 but task is already holding lock:
 ffff888100a5f9f8 (&cpu_buffer->mapping_lock){+.+.}-{4:4}, at: ring_buffer_map+0xcf/0xe70

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #5 (&cpu_buffer->mapping_lock){+.+.}-{4:4}:
        __mutex_lock+0x192/0x18c0
        ring_buffer_map+0xcf/0xe70
        tracing_buffers_mmap+0x1c4/0x3b0
        __mmap_region+0xd8d/0x1f70
        do_mmap+0x9d7/0x1010
        vm_mmap_pgoff+0x20b/0x390
        ksys_mmap_pgoff+0x2e9/0x440
        do_syscall_64+0x79/0x1c0
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #4 (&mm->mmap_lock){++++}-{4:4}:
        __might_fault+0xa5/0x110
        _copy_to_user+0x22/0x80
        _perf_ioctl+0x61b/0x1b70
        perf_ioctl+0x62/0x90
        __x64_sys_ioctl+0x134/0x190
        do_syscall_64+0x79/0x1c0
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #3 (&cpuctx_mutex){+.+.}-{4:4}:
        __mutex_lock+0x192/0x18c0
        perf_event_init_cpu+0x325/0x7c0
        perf_event_init+0x52a/0x5b0
        start_kernel+0x263/0x3e0
        x86_64_start_reservations+0x24/0x30
        x86_64_start_kernel+0x95/0xa0
        common_startup_64+0x13e/0x141

 -> #2 (pmus_lock){+.+.}-{4:4}:
        __mutex_lock+0x192/0x18c0
        perf_event_init_cpu+0xb7/0x7c0
        cpuhp_invoke_callback+0x2c0/0x1030
        __cpuhp_invoke_callback_range+0xbf/0x1f0
        _cpu_up+0x2e7/0x690
        cpu_up+0x117/0x170
        cpuhp_bringup_mask+0xd5/0x120
        bringup_nonboot_cpus+0x13d/0x170
        smp_init+0x2b/0xf0
        kernel_init_freeable+0x441/0x6d0
        kernel_init+0x1e/0x160
        ret_from_fork+0x34/0x70
        ret_from_fork_asm+0x1a/0x30

 -> #1 (cpu_hotplug_lock){++++}-{0:0}:
        cpus_read_lock+0x2a/0xd0
        ring_buffer_resize+0x610/0x14e0
        __tracing_resize_ring_buffer.part.0+0x42/0x120
        tracing_set_tracer+0x7bd/0xa80
        tracing_set_trace_write+0x132/0x1e0
        vfs_write+0x21c/0xe80
        ksys_write+0xf9/0x1c0
        do_syscall_64+0x79/0x1c0
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 -> #0 (&buffer->mutex){+.+.}-{4:4}:
        __lock_acquire+0x1405/0x2210
        lock_acquire+0x174/0x310
        __mutex_lock+0x192/0x18c0
        ring_buffer_map+0x11c/0xe70
        tracing_buffers_mmap+0x1c4/0x3b0
        __mmap_region+0xd8d/0x1f70
        do_mmap+0x9d7/0x1010
        vm_mmap_pgoff+0x20b/0x390
        ksys_mmap_pgoff+0x2e9/0x440
        do_syscall_64+0x79/0x1c0
        entry_SYSCALL_64_after_hwframe+0x76/0x7e

 other info that might help us debug this:

 Chain exists of:
   &buffer->mutex --> &mm->mmap_lock --> &cpu_buffer->mapping_lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&cpu_buffer->mapping_lock);
                                lock(&mm->mmap_lock);
                                lock(&cpu_buffer->mapping_lock);
   lock(&buffer->mutex);

  *** DEADLOCK ***

 2 locks held by trace-cmd/1113:
  #0: ffff888106b847e0 (&mm->mmap_lock){++++}-{4:4}, at: vm_mmap_pgoff+0x192/0x390
  #1: ffff888100a5f9f8 (&cpu_buffer->mapping_lock){+.+.}-{4:4}, at: ring_buffer_map+0xcf/0xe70

 stack backtrace:
 CPU: 5 UID: 0 PID: 1113 Comm: trace-cmd Not tainted 6.15.0-rc7-test-00002-gfb7d03d8a82f #551 PREEMPT
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x6e/0xa0
  print_circular_bug.cold+0x178/0x1be
  check_noncircular+0x146/0x160
  __lock_acquire+0x1405/0x2210
  lock_acquire+0x174/0x310
  ? ring_buffer_map+0x11c/0xe70
  ? ring_buffer_map+0x11c/0xe70
  ? __mutex_lock+0x169/0x18c0
  __mutex_lock+0x192/0x18c0
  ? ring_buffer_map+0x11c/0xe70
  ? ring_buffer_map+0x11c/0xe70
  ? function_trace_call+0x296/0x370
  ? __pfx___mutex_lock+0x10/0x10
  ? __pfx_function_trace_call+0x10/0x10
  ? __pfx___mutex_lock+0x10/0x10
  ? _raw_spin_unlock+0x2d/0x50
  ? ring_buffer_map+0x11c/0xe70
  ? ring_buffer_map+0x11c/0xe70
  ? __mutex_lock+0x5/0x18c0
  ring_buffer_map+0x11c/0xe70
  ? do_raw_spin_lock+0x12d/0x270
  ? find_held_lock+0x2b/0x80
  ? _raw_spin_unlock+0x2d/0x50
  ? rcu_is_watching+0x15/0xb0
  ? _raw_spin_unlock+0x2d/0x50
  ? trace_preempt_on+0xd0/0x110
  tracing_buffers_mmap+0x1c4/0x3b0
  __mmap_region+0xd8d/0x1f70
  ? ring_buffer_lock_reserve+0x99/0xff0
  ? __pfx___mmap_region+0x10/0x10
  ? ring_buffer_lock_reserve+0x99/0xff0
  ? __pfx_ring_buffer_lock_reserve+0x10/0x10
  ? __pfx_ring_buffer_lock_reserve+0x10/0x10
  ? bpf_lsm_mmap_addr+0x4/0x10
  ? security_mmap_addr+0x46/0xd0
  ? lock_is_held_type+0xd9/0x130
  do_mmap+0x9d7/0x1010
  ? 0xffffffffc0370095
  ? __pfx_do_mmap+0x10/0x10
  vm_mmap_pgoff+0x20b/0x390
  ? __pfx_vm_mmap_pgoff+0x10/0x10
  ? 0xffffffffc0370095
  ksys_mmap_pgoff+0x2e9/0x440
  do_syscall_64+0x79/0x1c0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7fb0963a7de2
 Code: 00 00 00 0f 1f 44 00 00 41 f7 c1 ff 0f 00 00 75 27 55 89 cd 53 48 89 fb 48 85 ff 74 3b 41 89 ea 48 89 df b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 5b 5d c3 0f 1f 00 48 8b 05 e1 9f 0d 00 64
 RSP: 002b:00007ffdcc8fb878 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb0963a7de2
 RDX: 0000000000000001 RSI: 0000000000001000 RDI: 0000000000000000
 RBP: 0000000000000001 R08: 0000000000000006 R09: 0000000000000000
 R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000
 R13: 00007ffdcc8fbe68 R14: 00007fb096628000 R15: 00005633e01a5c90
  </TASK>

The issue is that cpus_read_lock() is taken within buffer->mutex. The
memory mapped pages are taken with the mmap_lock held. The buffer->mutex
is taken within the cpu_buffer->mapping_lock. There's quite a chain with
all these locks, where the deadlock can be fixed by moving the
cpus_read_lock() outside the taking of the buffer->mutex.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Link: https://lore.kernel.org/20250527105820.0f45d045@gandalf.local.home
Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f9bf562beea..ca1a8e706004 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2849,6 +2849,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	if (nr_pages < 2)
 		nr_pages = 2;
 
+	/*
+	 * Keep CPUs from coming online while resizing to synchronize
+	 * with new per CPU buffers being created.
+	 */
+	guard(cpus_read_lock)();
+
 	/* prevent another thread from changing buffer sizes */
 	mutex_lock(&buffer->mutex);
 	atomic_inc(&buffer->resizing);
@@ -2893,7 +2899,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			cond_resched();
 		}
 
-		cpus_read_lock();
 		/*
 		 * Fire off all the required work handlers
 		 * We can't schedule on offline CPUs, but it's not necessary
@@ -2933,7 +2938,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			cpu_buffer->nr_pages_to_update = 0;
 		}
 
-		cpus_read_unlock();
 	} else {
 		cpu_buffer = buffer->buffers[cpu_id];
 
@@ -2961,8 +2965,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			goto out_err;
 		}
 
-		cpus_read_lock();
-
 		/* Can't run something on an offline CPU. */
 		if (!cpu_online(cpu_id))
 			rb_update_pages(cpu_buffer);
@@ -2981,7 +2983,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 		}
 
 		cpu_buffer->nr_pages_to_update = 0;
-		cpus_read_unlock();
 	}
 
  out:
-- 
cgit v1.2.3


From 4fc78a7c9ca994e1da5d3940704d4e8f0ea8c5e4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 28 May 2025 12:15:55 -0400
Subject: ring-buffer: Do not trigger WARN_ON() due to a commit_overrun

When reading a memory mapped buffer the reader page is just swapped out
with the last page written in the write buffer. If the reader page is the
same as the commit buffer (the buffer that is currently being written to)
it was assumed that it should never have missed events. If it does, it
triggers a WARN_ON_ONCE().

But there just happens to be one scenario where this can legitimately
happen. That is on a commit_overrun. A commit overrun is when an interrupt
preempts an event being written to the buffer and then the interrupt adds
so many new events that it fills and wraps the buffer back to the commit.
Any new events would then be dropped and be reported as "missed_events".

In this case, the next page to read is the commit buffer and after the
swap of the reader page, the reader page will be the commit buffer, but
this time there will be missed events and this triggers the following
warning:

 ------------[ cut here ]------------
 WARNING: CPU: 2 PID: 1127 at kernel/trace/ring_buffer.c:7357 ring_buffer_map_get_reader+0x49a/0x780
 Modules linked in: kvm_intel kvm irqbypass
 CPU: 2 UID: 0 PID: 1127 Comm: trace-cmd Not tainted 6.15.0-rc7-test-00004-g478bc2824b45-dirty #564 PREEMPT
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
 RIP: 0010:ring_buffer_map_get_reader+0x49a/0x780
 Code: 00 00 00 48 89 fe 48 c1 ee 03 80 3c 2e 00 0f 85 ec 01 00 00 4d 3b a6 a8 00 00 00 0f 85 8a fd ff ff 48 85 c0 0f 84 55 fe ff ff <0f> 0b e9 4e fe ff ff be 08 00 00 00 4c 89 54 24 58 48 89 54 24 50
 RSP: 0018:ffff888121787dc0 EFLAGS: 00010002
 RAX: 00000000000006a2 RBX: ffff888100062800 RCX: ffffffff8190cb49
 RDX: ffff888126934c00 RSI: 1ffff11020200a15 RDI: ffff8881010050a8
 RBP: dffffc0000000000 R08: 0000000000000000 R09: ffffed1024d26982
 R10: ffff888126934c17 R11: ffff8881010050a8 R12: ffff888126934c00
 R13: ffff8881010050b8 R14: ffff888101005000 R15: ffff888126930008
 FS:  00007f95c8cd7540(0000) GS:ffff8882b576e000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f95c8de4dc0 CR3: 0000000128452002 CR4: 0000000000172ef0
 Call Trace:
  <TASK>
  ? __pfx_ring_buffer_map_get_reader+0x10/0x10
  tracing_buffers_ioctl+0x283/0x370
  __x64_sys_ioctl+0x134/0x190
  do_syscall_64+0x79/0x1c0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f95c8de48db
 Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
 RSP: 002b:00007ffe037ba110 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 00007ffe037bb2b0 RCX: 00007f95c8de48db
 RDX: 0000000000000000 RSI: 0000000000005220 RDI: 0000000000000006
 RBP: 00007ffe037ba180 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
 R13: 00007ffe037bb6f8 R14: 00007f95c9065000 R15: 00005575c7492c90
  </TASK>
 irq event stamp: 5080
 hardirqs last  enabled at (5079): [<ffffffff83e0adb0>] _raw_spin_unlock_irqrestore+0x50/0x70
 hardirqs last disabled at (5080): [<ffffffff83e0aa83>] _raw_spin_lock_irqsave+0x63/0x70
 softirqs last  enabled at (4182): [<ffffffff81516122>] handle_softirqs+0x552/0x710
 softirqs last disabled at (4159): [<ffffffff815163f7>] __irq_exit_rcu+0x107/0x210
 ---[ end trace 0000000000000000 ]---

The above was triggered by running on a kernel with both lockdep and KASAN
as well as kmemleak enabled and executing the following command:

 # perf record -o perf-test.dat -a -- trace-cmd record --nosplice  -e all -p function hackbench 50

With perf interjecting a lot of interrupts and trace-cmd enabling all
events as well as function tracing, with lockdep, KASAN and kmemleak
enabled, it could cause an interrupt preempting an event being written to
add enough events to wrap the buffer. trace-cmd was modified to have
--nosplice use mmap instead of reading the buffer.

To differentiate this case from the normal case, where only one page was
written to and the swap of the reader page received that one page (which
is the commit page), check if the tail page is on the reader page. The
difference between the commit page and the tail page is
that the tail page is where new writes go to, and the commit page holds
the first write that hasn't been committed yet. In the case of an
interrupt preempting the write of an event and filling the buffer, it
would move the tail page but not the commit page.

Have the warning only trigger if the tail page is also on the reader page,
and also print out the number of events dropped by a commit overrun as
that can not yet be safely added to the page so that the reader can see
there were events dropped.

Cc: stable@vger.kernel.org
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Link: https://lore.kernel.org/20250528121555.2066527e@gandalf.local.home
Fixes: fe832be05a8ee ("ring-buffer: Have mmapped ring buffer keep track of missed events")
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ca1a8e706004..683aa57870fe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7285,8 +7285,8 @@ consume:
 	/* Check if any events were dropped */
 	missed_events = cpu_buffer->lost_events;
 
-	if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
-		if (missed_events) {
+	if (missed_events) {
+		if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
 			struct buffer_data_page *bpage = reader->page;
 			unsigned int commit;
 			/*
@@ -7307,13 +7307,23 @@ consume:
 				local_add(RB_MISSED_STORED, &bpage->commit);
 			}
 			local_add(RB_MISSED_EVENTS, &bpage->commit);
+		} else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+				      "Reader on commit with %ld missed events",
+				      missed_events)) {
+			/*
+			 * There shouldn't be any missed events if the tail_page
+			 * is on the reader page. But if the tail page is not on the
+			 * reader page and the commit_page is, that would mean that
+			 * there's a commit_overrun (an interrupt preempted an
+			 * addition of an event and then filled the buffer
+			 * with new events). In this case it's not an
+			 * error, but it should still be reported.
+			 *
+			 * TODO: Add missed events to the page for user space to know.
+			 */
+			pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+				cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
 		}
-	} else {
-		/*
-		 * There really shouldn't be any missed events if the commit
-		 * is on the reader page.
-		 */
-		WARN_ON_ONCE(missed_events);
 	}
 
 	cpu_buffer->lost_events = 0;
-- 
cgit v1.2.3


From c2a08311427cc8c5c547e5d700cb2f93d63fcb2a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 1 Apr 2025 16:25:54 -0400
Subject: ring-buffer: Allow reserve_mem persistent ring buffers to be mmapped

When the persistent ring buffer is created from the memory returned by
reserve_mem there is nothing prohibiting it from being memory mapped to
user space. The memory is the same as the pages allocated by alloc_page().

The way the memory is managed by the ring buffer code is slightly
different though and needs to be addressed.

The persistent memory uses the page->id for its own purpose whereas the
user mmap buffer currently uses that for the subbuf array mapped to user
space. If the buffer is a persistent buffer, use the page index into that
buffer as the identifier instead of the page->id.

That is, the page->id for a persistent buffer represents the order the
buffer is in the linked list. ->id == 0 means it is the reader page.
When a reader page is swapped, the new reader page's ->id gets zero, and
the old reader page gets the ->id of the page that it swapped with.

For the user space mapping, the ->id is the index of where it was mapped
in user space, and it does not change while it is mapped.

Since the persistent buffer is fixed in its location, the index of where
a page is in the memory range can be used as the "id" to put in the meta
page array, and it can be mapped in the same order to user space as it is
in the persistent memory.

A new rb_page_id() helper function is used to get and set the id depending
on if the page is a normal memory allocated buffer or a physical memory
mapped buffer.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/20250401203332.246646011@goodmis.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 683aa57870fe..e40f5c6d7908 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6003,6 +6003,39 @@ static void rb_clear_buffer_page(struct buffer_page *page)
 	page->read = 0;
 }
 
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normal allocated ring buffer, the id is saved in the buffer page
+ * id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ordering in the memory layout and can not be used. Instead
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct buffer_page *bpage, int id)
+{
+	/*
+	 * For boot buffers, the id is the index,
+	 * otherwise, set the buffer page with this id
+	 */
+	if (cpu_buffer->ring_meta)
+		id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+	else
+		bpage->id = id;
+
+	return id;
+}
+
 static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
@@ -6011,7 +6044,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 		return;
 
 	meta->reader.read = cpu_buffer->reader_page->read;
-	meta->reader.id = cpu_buffer->reader_page->id;
+	meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+				     cpu_buffer->reader_page->id);
+
 	meta->reader.lost_events = cpu_buffer->lost_events;
 
 	meta->entries = local_read(&cpu_buffer->entries);
@@ -6927,23 +6962,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
 	unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
 	struct buffer_page *first_subbuf, *subbuf;
+	int cnt = 0;
 	int id = 0;
 
-	subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
-	cpu_buffer->reader_page->id = id++;
+	id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+	subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+	cnt++;
 
 	first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
 	do {
+		id = rb_page_id(cpu_buffer, subbuf, id);
+
 		if (WARN_ON(id >= nr_subbufs))
 			break;
 
 		subbuf_ids[id] = (unsigned long)subbuf->page;
-		subbuf->id = id;
 
 		rb_inc_page(&subbuf);
 		id++;
+		cnt++;
 	} while (subbuf != first_subbuf);
 
+	WARN_ON(cnt != nr_subbufs);
+
 	/* install subbuf ID to kern VA translation */
 	cpu_buffer->subbuf_ids = subbuf_ids;
 
-- 
cgit v1.2.3


From 2d22216521b12c4b09479aa684504092cd67970b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 15:51:16 -0400
Subject: ring-buffer: Removed unnecessary if() goto out where out is the next
 line

In the function ring_buffer_discard_commit() there's an if statement that
jumps to the next line:

	if (rb_try_to_discard(cpu_buffer, event))
		goto out;
 out:

This was caused by the change that modified the way timestamps were taken
in interrupt context, and removed the code between the goto and the out
label, but failed to update the conditional logic.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250527155116.227f35be@gandalf.local.home
Fixes: a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp")
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e40f5c6d7908..a01fc42f2a13 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4685,10 +4685,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
 
 	rb_decrement_entry(cpu_buffer, event);
-	if (rb_try_to_discard(cpu_buffer, event))
-		goto out;
-
- out:
+	rb_try_to_discard(cpu_buffer, event);
 	rb_end_commit(cpu_buffer);
 
 	trace_recursive_unlock(cpu_buffer);
-- 
cgit v1.2.3


From f115d2b70bff2665f67fa8e8dc5ed9452b696c44 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 14:57:53 -0400
Subject: ring-buffer: Remove jump to out label in ring_buffer_swap_cpu()

The function ring_buffer_swap_cpu() has a bunch of jumps to the label out
that simply returns "ret". There's no reason to jump to a label that
simply returns a value. Just return directly from there.

This goes back to almost the beginning when commit 8aabee573dff
("ring-buffer: remove unneeded get_online_cpus") was introduced. That
commit removed a put_online_cpus() from that label, but never updated all
the jumps to it that now no longer needed to do anything but return a
value.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250527145753.6b45d840@gandalf.local.home
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a01fc42f2a13..c912ce4c8c89 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6315,37 +6315,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 
 	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
 	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
-		goto out;
+		return -EINVAL;
 
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
 	/* It's up to the callers to not try to swap mapped buffers */
-	if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+		return -EBUSY;
 
 	/* At least make sure the two buffers are somewhat the same */
 	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
-		goto out;
+		return -EINVAL;
 
 	if (buffer_a->subbuf_order != buffer_b->subbuf_order)
-		goto out;
-
-	ret = -EAGAIN;
+		return -EINVAL;
 
 	if (atomic_read(&buffer_a->record_disabled))
-		goto out;
+		return -EAGAIN;
 
 	if (atomic_read(&buffer_b->record_disabled))
-		goto out;
+		return -EAGAIN;
 
 	if (atomic_read(&cpu_buffer_a->record_disabled))
-		goto out;
+		return -EAGAIN;
 
 	if (atomic_read(&cpu_buffer_b->record_disabled))
-		goto out;
+		return -EAGAIN;
 
 	/*
 	 * We can't do a synchronize_rcu here because this
@@ -6382,7 +6378,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 out_dec:
 	atomic_dec(&cpu_buffer_a->record_disabled);
 	atomic_dec(&cpu_buffer_b->record_disabled);
-out:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
-- 
cgit v1.2.3


From f0d8cbc8ccc6299046ceb5023a2af9e2b75c4106 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 14:46:23 -0400
Subject: ring-buffer: Simplify reset_disabled_cpu_buffer() with use of guard()

Use guard(raw_spinlock_irqsave)() in reset_disabled_cpu_buffer() to
simplify the locking.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250527144623.77a9cc47@gandalf.local.home
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c912ce4c8c89..58cca10f482b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6113,21 +6113,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 /* Must have disabled the cpu buffer then done a synchronize_rcu */
 static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 
 	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
-		goto out;
+		return;
 
 	arch_spin_lock(&cpu_buffer->lock);
 
 	rb_reset_cpu(cpu_buffer);
 
 	arch_spin_unlock(&cpu_buffer->lock);
-
- out:
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
 /**
-- 
cgit v1.2.3


From b2e7c6ed26e90fab1e5e626071e54e3b9ec9cb5a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 14:52:16 -0400
Subject: ring-buffer: Simplify ring_buffer_read_page() with guard()

The function ring_buffer_read_page() had two gotos. One was simply
returning "ret" and the other was unlocking the reader_lock.

There's no reason to use goto to simply return the "ret" variable. Instead
just return the value.

The jump to the unlocking of the reader_lock can be replaced by
guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock).

With these two changes the "ret" variable is no longer used and can be
removed. The return value on non-error is what was read and is stored in
the "read" variable.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250527145216.0187cf36@gandalf.local.home
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 58cca10f482b..931bbcc6640f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6531,38 +6531,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	struct buffer_data_page *bpage;
 	struct buffer_page *reader;
 	unsigned long missed_events;
-	unsigned long flags;
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
-	int ret = -1;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return -1;
 
 	/*
 	 * If len is not big enough to hold the page header, then
 	 * we can not copy anything.
 	 */
 	if (len <= BUF_PAGE_HDR_SIZE)
-		goto out;
+		return -1;
 
 	len -= BUF_PAGE_HDR_SIZE;
 
 	if (!data_page || !data_page->data)
-		goto out;
+		return -1;
+
 	if (data_page->order != buffer->subbuf_order)
-		goto out;
+		return -1;
 
 	bpage = data_page->data;
 	if (!bpage)
-		goto out;
+		return -1;
 
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 
 	reader = rb_get_reader_page(cpu_buffer);
 	if (!reader)
-		goto out_unlock;
+		return -1;
 
 	event = rb_reader_event(cpu_buffer);
 
@@ -6596,7 +6595,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		if (full &&
 		    (!read || (len < (commit - read)) ||
 		     cpu_buffer->reader_page == cpu_buffer->commit_page))
-			goto out_unlock;
+			return -1;
 
 		if (len > (commit - read))
 			len = (commit - read);
@@ -6605,7 +6604,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		size = rb_event_ts_length(event);
 
 		if (len < size)
-			goto out_unlock;
+			return -1;
 
 		/* save the current timestamp, since the user will need it */
 		save_timestamp = cpu_buffer->read_stamp;
@@ -6663,7 +6662,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		if (reader->real_end)
 			local_set(&bpage->commit, reader->real_end);
 	}
-	ret = read;
 
 	cpu_buffer->lost_events = 0;
 
@@ -6690,11 +6688,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	if (commit < buffer->subbuf_size)
 		memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
 
- out_unlock:
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
-	return ret;
+	return read;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
-- 
cgit v1.2.3


From 60bc720e10eac397b3adae975095df77bc368b88 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 12:20:09 -0400
Subject: ring-buffer: Make ring_buffer_{un}map() simpler with guard(mutex)

Convert the taking of the buffer->mutex and the cpu_buffer->mapping_lock
over to guard(mutex) and simplify the ring_buffer_map() and
ring_buffer_unmap() functions.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Link: https://lore.kernel.org/20250527122009.267efb72@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 931bbcc6640f..ef1cca8125df 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7161,36 +7161,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags, *subbuf_ids;
-	int err = 0;
+	int err;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return -EINVAL;
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	mutex_lock(&cpu_buffer->mapping_lock);
+	guard(mutex)(&cpu_buffer->mapping_lock);
 
 	if (cpu_buffer->user_mapped) {
 		err = __rb_map_vma(cpu_buffer, vma);
 		if (!err)
 			err = __rb_inc_dec_mapped(cpu_buffer, true);
-		mutex_unlock(&cpu_buffer->mapping_lock);
 		return err;
 	}
 
 	/* prevent another thread from changing buffer/sub-buffer sizes */
-	mutex_lock(&buffer->mutex);
+	guard(mutex)(&buffer->mutex);
 
 	err = rb_alloc_meta_page(cpu_buffer);
 	if (err)
-		goto unlock;
+		return err;
 
 	/* subbuf_ids include the reader while nr_pages does not */
 	subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
 	if (!subbuf_ids) {
 		rb_free_meta_page(cpu_buffer);
-		err = -ENOMEM;
-		goto unlock;
+		return -ENOMEM;
 	}
 
 	atomic_inc(&cpu_buffer->resize_disabled);
@@ -7218,35 +7216,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 		atomic_dec(&cpu_buffer->resize_disabled);
 	}
 
-unlock:
-	mutex_unlock(&buffer->mutex);
-	mutex_unlock(&cpu_buffer->mapping_lock);
-
-	return err;
+	return 0;
 }
 
 int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags;
-	int err = 0;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return -EINVAL;
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	mutex_lock(&cpu_buffer->mapping_lock);
+	guard(mutex)(&cpu_buffer->mapping_lock);
 
 	if (!cpu_buffer->user_mapped) {
-		err = -ENODEV;
-		goto out;
+		return -ENODEV;
 	} else if (cpu_buffer->user_mapped > 1) {
 		__rb_inc_dec_mapped(cpu_buffer, false);
-		goto out;
+		return 0;
 	}
 
-	mutex_lock(&buffer->mutex);
+	guard(mutex)(&buffer->mutex);
 	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	/* This is the last user space mapping */
@@ -7261,12 +7253,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 	rb_free_meta_page(cpu_buffer);
 	atomic_dec(&cpu_buffer->resize_disabled);
 
-	mutex_unlock(&buffer->mutex);
-
-out:
-	mutex_unlock(&cpu_buffer->mapping_lock);
-
-	return err;
+	return 0;
 }
 
 int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
-- 
cgit v1.2.3


From 99d232804405e35d7a9af1536a057578a1442e81 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2025 14:31:44 -0400
Subject: ring-buffer: Simplify functions with __free(kfree) to free
 allocations

The function rb_allocate_cpu_buffer() allocates cpu_buffer and on error
needs to free it. It has a single return. Use __free(kfree) and return
directly on errors and have the return use return_ptr(cpu_buffer).

The function alloc_buffer() allocates buffer and on error needs to free
it. It has a single return. Use __free(kfree) and return directly on
errors and have the return use return_ptr(buffer).

The function __rb_map_vma() allocates a temporary array "pages". Have it
use __free() and not worry about freeing it when returning.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250527143144.6edc4625@gandalf.local.home
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ef1cca8125df..295b6fbfa81a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2226,7 +2226,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 static struct ring_buffer_per_cpu *
 rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 {
-	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL;
 	struct ring_buffer_cpu_meta *meta;
 	struct buffer_page *bpage;
 	struct page *page;
@@ -2252,7 +2252,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
 	if (!bpage)
-		goto fail_free_buffer;
+		return NULL;
 
 	rb_check_bpage(cpu_buffer, bpage);
 
@@ -2318,13 +2318,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 		rb_head_page_activate(cpu_buffer);
 	}
 
-	return cpu_buffer;
+	return_ptr(cpu_buffer);
 
  fail_free_reader:
 	free_buffer_page(cpu_buffer->reader_page);
 
- fail_free_buffer:
-	kfree(cpu_buffer);
 	return NULL;
 }
 
@@ -2359,7 +2357,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 					 unsigned long scratch_size,
 					 struct lock_class_key *key)
 {
-	struct trace_buffer *buffer;
+	struct trace_buffer *buffer __free(kfree) = NULL;
 	long nr_pages;
 	int subbuf_size;
 	int bsize;
@@ -2373,7 +2371,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 		return NULL;
 
 	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
-		goto fail_free_buffer;
+		return NULL;
 
 	buffer->subbuf_order = order;
 	subbuf_size = (PAGE_SIZE << order);
@@ -2472,7 +2470,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
 
 	mutex_init(&buffer->mutex);
 
-	return buffer;
+	return_ptr(buffer);
 
  fail_free_buffers:
 	for_each_buffer_cpu(buffer, cpu) {
@@ -2484,8 +2482,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
  fail_free_cpumask:
 	free_cpumask_var(buffer->cpumask);
 
- fail_free_buffer:
-	kfree(buffer);
 	return NULL;
 }
 
@@ -7057,7 +7053,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
 	unsigned int subbuf_pages, subbuf_order;
-	struct page **pages;
+	struct page **pages __free(kfree) = NULL;
 	int p = 0, s = 0;
 	int err;
 
@@ -7125,10 +7121,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
 		struct page *page;
 		int off = 0;
 
-		if (WARN_ON_ONCE(s >= nr_subbufs)) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (WARN_ON_ONCE(s >= nr_subbufs))
+			return -EINVAL;
 
 		page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
 
@@ -7143,9 +7137,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
 
 	err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
 
-out:
-	kfree(pages);
-
 	return err;
 }
 #else
-- 
cgit v1.2.3