From 45d99ea451d0c30bfd4864f0fe485d7dac014902 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 21 Sep 2023 20:54:25 +0800 Subject: ring-buffer: Fix bytes info in per_cpu buffer stats The 'bytes' info in file 'per_cpu/cpu/stats' means the number of bytes in cpu buffer that have not been consumed. However, currently after consuming data by reading file 'trace_pipe', the 'bytes' info was not changed as expected. # cat per_cpu/cpu0/stats entries: 0 overrun: 0 commit overrun: 0 bytes: 568 <--- 'bytes' is problematical !!! oldest event ts: 8651.371479 now ts: 8653.912224 dropped events: 0 read events: 8 The root cause is incorrect stat on cpu_buffer->read_bytes. To fix it: 1. When stat 'read_bytes', account consumed event in rb_advance_reader(); 2. When stat 'entries_bytes', exclude the discarded padding event which is smaller than minimum size because it is invisible to reader. Then use rb_page_commit() instead of BUF_PAGE_SIZE at where accounting for page-based read/remove/overrun. Also correct the comments of ring_buffer_bytes_cpu() in this patch. Link: https://lore.kernel.org/linux-trace-kernel/20230921125425.1708423-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: c64e148a3be3 ("trace: Add ring buffer stats to measure rate of events") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1651edc48d5..28daf0ce95c5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); @@ -2003,7 +2008,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } @@ -2367,11 +2372,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { @@ -2517,7 +2517,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* @@ -2660,9 +2660,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2676,7 +2673,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure - * that this space is not used again. + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. @@ -2701,6 +2699,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); @@ -4215,7 +4216,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ @@ -4723,6 +4724,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) length = rb_event_length(event); cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -5816,7 +5818,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; + cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); -- cgit v1.2.3 From 1e0cb399c7653462d9dadf8ab9425337c355d358 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 29 Sep 2023 18:01:13 -0400 Subject: ring-buffer: Update "shortest_full" in polling It was discovered that the ring buffer polling was incorrectly stating that read would not block, but that's because polling did not take into account that reads will block if the "buffer-percent" was set. Instead, the ring buffer polling would say reads would not block if there was any data in the ring buffer. This was incorrect behavior from a user space point of view. This was fixed by commit 42fb0a1e84ff by having the polling code check if the ring buffer had more data than what the user specified "buffer percent" had. The problem now is that the polling code did not register itself to the writer that it wanted to wait for a specific "full" value of the ring buffer. The result was that the writer would wake the polling waiter whenever there was a new event. The polling waiter would then wake up, see that there's not enough data in the ring buffer to notify user space and then go back to sleep. The next event would wake it up again. Before the polling fix was added, the code would wake up around 100 times for a hackbench 30 benchmark. After the "fix", due to the constant waking of the writer, it would wake up over 11,0000 times! It would never leave the kernel, so the user space behavior was still "correct", but this definitely is not the desired effect. To fix this, have the polling code add what it's waiting for to the "shortest_full" variable, to tell the writer not to wake it up if the buffer is not as full as it expects to be. Note, after this fix, it appears that the waiter is now woken up around 2x the times it was before (~200). This is a tremendous improvement from the 11,000 times, but I will need to spend some time to see why polling is more aggressive in its wakeups than the read blocking code. Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark") Reported-by: Julia Lawall Tested-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 28daf0ce95c5..515cafdb18d9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1137,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, if (full) { poll_wait(filp, &work->full_waiters, poll_table); work->full_waiters_pending = true; + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; } else { poll_wait(filp, &work->waiters, poll_table); work->waiters_pending = true; -- cgit v1.2.3