summaryrefslogtreecommitdiff
path: root/tools/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-31 08:52:33 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-31 08:52:33 -0700
commit802f0d58d52e8e34e08718479475ccdff0caffa0 (patch)
tree305f3be98d12b0c6881a6c59eb92e795e6088e51 /tools/lib
parent4e82c87058f45e79eeaa4d5bcc3b38dd3dce7209 (diff)
parent35d13f841a3d8159ef20d5e32a9ed3faa27875bc (diff)
Merge tag 'perf-tools-for-v6.15-2025-03-27' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim: "perf record: - Introduce latency profiling using scheduler information. The latency profiling is to show impacts on wall-time rather than cpu-time. By tracking context switches, it can weight samples and find which part of the code contributed more to the execution latency. The value (period) of the sample is weighted by dividing it by the number of parallel execution at the moment. The parallelism is tracked in perf report with sched-switch records. This will reduce the portion that are run in parallel and in turn increase the portion of serial executions. For now, it's limited to profile processes, IOW system-wide profiling is not supported. You can add --latency option to enable this. $ perf record --latency -- make -C tools/perf I've run the above command for perf build which adds -j option to make with the number of CPUs in the system internally. Normally it'd show something like below: $ perf report -F overhead,comm ... # # Overhead Command # ........ ............... # 78.97% cc1 6.54% python3 4.21% shellcheck 3.28% ld 1.80% as 1.37% cc1plus 0.80% sh 0.62% clang 0.56% gcc 0.44% perl 0.39% make ... The cc1 takes around 80% of the overhead as it's the actual compiler. However it runs in parallel so its contribution to latency may be less than that. Now, perf report will show both overhead and latency (if --latency was given at record time) like below: $ perf report -s comm ... # # Overhead Latency Command # ........ ........ ............... # 78.97% 48.66% cc1 6.54% 25.68% python3 4.21% 0.39% shellcheck 3.28% 13.70% ld 1.80% 2.56% as 1.37% 3.08% cc1plus 0.80% 0.98% sh 0.62% 0.61% clang 0.56% 0.33% gcc 0.44% 1.71% perl 0.39% 0.83% make ... You can see latency of cc1 goes down to around 50% and python3 and ld contribute a lot more than their overhead. You can use --latency option in perf report to get the same result but ordered by latency. $ perf report --latency -s comm perf report: - As a side effect of the latency profiling work, it adds a new output field 'latency' and a sort key 'parallelism'. The below is a result from my system with 64 CPUs. The build was well-parallelized but contained some serial portions. $ perf report -s parallelism ... # # Overhead Latency Parallelism # ........ ........ ........... # 16.95% 1.54% 62 13.38% 1.24% 61 12.50% 70.47% 1 11.81% 1.06% 63 7.59% 0.71% 60 4.33% 12.20% 2 3.41% 0.33% 59 2.05% 0.18% 64 1.75% 1.09% 9 1.64% 1.85% 5 ... - Support Feodra mini-debuginfo which is a LZMA compressed symbol table inside ".gnu_debugdata" ELF section. perf annotate: - Add --code-with-type option to enable data-type profiling with the usual annotate output. Instead of focusing on data structure, it shows code annotation together with data type it accesses in case the instruction refers to a memory location (and it was able to resolve the target data type). Currently it only works with --stdio. $ perf annotate --stdio --code-with-type ... Percent | Source code & Disassembly of vmlinux for cpu/mem-loads,ldlat=30/pp (18 samples, percent: local period) ---------------------------------------------------------------------------------------------------------------------- : 0 0xffffffff81050610 <__fdget>: 0.00 : ffffffff81050610: callq 0xffffffff81c01b80 <__fentry__> # data-type: (stack operation) 0.00 : ffffffff81050615: pushq %rbp # data-type: (stack operation) 0.00 : ffffffff81050616: movq %rsp, %rbp 0.00 : ffffffff81050619: pushq %r15 # data-type: (stack operation) 0.00 : ffffffff8105061b: pushq %r14 # data-type: (stack operation) 0.00 : ffffffff8105061d: pushq %rbx # data-type: (stack operation) 0.00 : ffffffff8105061e: subq $0x10, %rsp 0.00 : ffffffff81050622: movl %edi, %ebx 0.00 : ffffffff81050624: movq %gs:0x7efc4814(%rip), %rax # 0x14e40 <current_task> # data-type: struct task_struct* +0 0.00 : ffffffff8105062c: movq 0x8d0(%rax), %r14 # data-type: struct task_struct +0x8d0 (files) 0.00 : ffffffff81050633: movl (%r14), %eax # data-type: struct files_struct +0 (count.counter) 0.00 : ffffffff81050636: cmpl $0x1, %eax 0.00 : ffffffff81050639: je 0xffffffff810506a9 <__fdget+0x99> 0.00 : ffffffff8105063b: movq 0x20(%r14), %rcx # data-type: struct files_struct +0x20 (fdt) 0.00 : ffffffff8105063f: movl (%rcx), %eax # data-type: struct fdtable +0 (max_fds) 0.00 : ffffffff81050641: cmpl %ebx, %eax 0.00 : ffffffff81050643: jbe 0xffffffff810506ef <__fdget+0xdf> 0.00 : ffffffff81050649: movl %ebx, %r15d 5.56 : ffffffff8105064c: movq 0x8(%rcx), %rdx # data-type: struct fdtable +0x8 (fd) ... The "# data-type:" part was added with this change. The first few entries are not very interesting. But later you can it accesses a couple of fields in the task_struct, files_struct and fdtable. perf trace: - Support syscall tracing for different ABI. For example it can trace system calls for 32-bit applications on 64-bit kernel transparently. - Add --summary-mode=total option to show global syscall summary. The default is 'thread' to show per-thread syscall summary. Python support: - Add more interfaces to 'perf' module to parse events, and config, enable or disable the event list properly so that it can implement basic functionalities purely in Python. There is an example code for these new interfaces in python/tracepoint.py. - Add mypy and pylint support to enable build time checking. Fix some code based on the findings from these tools. Internals: - Introduce io_dir__readdir() API to make directory traveral (usually for proc or sysfs) efficient with less memory footprint. JSON vendor events: - Add events and metrics for ARM Neoverse N3 and V3 - Update events and metrics on various Intel CPUs - Add/update events for a number of SiFive processors" * tag 'perf-tools-for-v6.15-2025-03-27' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (229 commits) perf bpf-filter: Fix a parsing error with comma perf report: Fix a memory leak for perf_env on AMD perf trace: Fix wrong size to bpf_map__update_elem call perf tools: annotate asm_pure_loop.S perf python: Fix setup.py mypy errors perf test: Address attr.py mypy error perf build: Add pylint build tests perf build: Add mypy build tests perf build: Rename TEST_LOGS to SHELL_TEST_LOGS tools/build: Don't pass test log files to linker perf bench sched pipe: fix enforced blocking reads in worker_thread perf tools: Fix is_compat_mode build break in ppc64 perf build: filter all combinations of -flto for libperl perf vendor events arm64 AmpereOneX: Fix frontend_bound calculation perf vendor events arm64: AmpereOne/AmpereOneX: Mark LD_RETIRED impacted by errata perf trace: Fix evlist memory leak perf trace: Fix BTF memory leak perf trace: Make syscall table stable perf syscalltbl: Mask off ABI type for MIPS system calls perf build: Remove Makefile.syscalls ...
Diffstat (limited to 'tools/lib')
-rw-r--r--tools/lib/api/Makefile2
-rw-r--r--tools/lib/api/io_dir.h105
-rw-r--r--tools/lib/perf/Makefile12
-rw-r--r--tools/lib/perf/cpumap.c8
-rw-r--r--tools/lib/perf/include/perf/cpumap.h3
5 files changed, 115 insertions, 15 deletions
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
index 7f6396087b46..8665c799e0fa 100644
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -95,7 +95,7 @@ install_lib: $(LIBFILE)
$(call do_install_mkdir,$(libdir_SQ)); \
cp -fpR $(LIBFILE) $(DESTDIR)$(libdir_SQ)
-HDRS := cpu.h debug.h io.h
+HDRS := cpu.h debug.h io.h io_dir.h
FD_HDRS := fd/array.h
FS_HDRS := fs/fs.h fs/tracing_path.h
INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/api
diff --git a/tools/lib/api/io_dir.h b/tools/lib/api/io_dir.h
new file mode 100644
index 000000000000..ef83e967e48c
--- /dev/null
+++ b/tools/lib/api/io_dir.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/*
+ * Lightweight directory reading library.
+ */
+#ifndef __API_IO_DIR__
+#define __API_IO_DIR__
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <linux/limits.h>
+
+#if !defined(SYS_getdents64)
+#if defined(__x86_64__) || defined(__arm__)
+ #define SYS_getdents64 217
+#elif defined(__i386__) || defined(__s390x__) || defined(__sh__)
+ #define SYS_getdents64 220
+#elif defined(__alpha__)
+ #define SYS_getdents64 377
+#elif defined(__mips__)
+ #define SYS_getdents64 308
+#elif defined(__powerpc64__) || defined(__powerpc__)
+ #define SYS_getdents64 202
+#elif defined(__sparc64__) || defined(__sparc__)
+ #define SYS_getdents64 154
+#elif defined(__xtensa__)
+ #define SYS_getdents64 60
+#else
+ #define SYS_getdents64 61
+#endif
+#endif /* !defined(SYS_getdents64) */
+
+static inline ssize_t perf_getdents64(int fd, void *dirp, size_t count)
+{
+#ifdef MEMORY_SANITIZER
+ memset(dirp, 0, count);
+#endif
+ return syscall(SYS_getdents64, fd, dirp, count);
+}
+
+struct io_dirent64 {
+ ino64_t d_ino; /* 64-bit inode number */
+ off64_t d_off; /* 64-bit offset to next structure */
+ unsigned short d_reclen; /* Size of this dirent */
+ unsigned char d_type; /* File type */
+ char d_name[NAME_MAX + 1]; /* Filename (null-terminated) */
+};
+
+struct io_dir {
+ int dirfd;
+ ssize_t available_bytes;
+ struct io_dirent64 *next;
+ struct io_dirent64 buff[4];
+};
+
+static inline void io_dir__init(struct io_dir *iod, int dirfd)
+{
+ iod->dirfd = dirfd;
+ iod->available_bytes = 0;
+}
+
+static inline void io_dir__rewinddir(struct io_dir *iod)
+{
+ lseek(iod->dirfd, 0, SEEK_SET);
+ iod->available_bytes = 0;
+}
+
+static inline struct io_dirent64 *io_dir__readdir(struct io_dir *iod)
+{
+ struct io_dirent64 *entry;
+
+ if (iod->available_bytes <= 0) {
+ ssize_t rc = perf_getdents64(iod->dirfd, iod->buff, sizeof(iod->buff));
+
+ if (rc <= 0)
+ return NULL;
+ iod->available_bytes = rc;
+ iod->next = iod->buff;
+ }
+ entry = iod->next;
+ iod->next = (struct io_dirent64 *)((char *)entry + entry->d_reclen);
+ iod->available_bytes -= entry->d_reclen;
+ return entry;
+}
+
+static inline bool io_dir__is_dir(const struct io_dir *iod, struct io_dirent64 *dent)
+{
+ if (dent->d_type == DT_UNKNOWN) {
+ struct stat st;
+
+ if (fstatat(iod->dirfd, dent->d_name, &st, /*flags=*/0))
+ return false;
+
+ if (S_ISDIR(st.st_mode)) {
+ dent->d_type = DT_DIR;
+ return true;
+ }
+ }
+ return dent->d_type == DT_DIR;
+}
+
+#endif /* __API_IO_DIR__ */
diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index e9a7ac2c062e..ffcfd777c451 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -41,13 +41,6 @@ libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
TEST_ARGS := $(if $(V),-v)
-# Set compile option CFLAGS
-ifdef EXTRA_CFLAGS
- CFLAGS := $(EXTRA_CFLAGS)
-else
- CFLAGS := -g -Wall
-endif
-
INCLUDES = \
-I$(srctree)/tools/lib/perf/include \
-I$(srctree)/tools/lib/ \
@@ -57,11 +50,12 @@ INCLUDES = \
-I$(srctree)/tools/include/uapi
# Append required CFLAGS
-override CFLAGS += $(EXTRA_WARNINGS)
-override CFLAGS += -Werror -Wall
+override CFLAGS += -g -Werror -Wall
override CFLAGS += -fPIC
override CFLAGS += $(INCLUDES)
override CFLAGS += -fvisibility=hidden
+override CFLAGS += $(EXTRA_WARNINGS)
+override CFLAGS += $(EXTRA_CFLAGS)
all:
diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index fcc47214062a..4454a5987570 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -185,7 +185,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
while (isdigit(*cpu_list)) {
p = NULL;
start_cpu = strtoul(cpu_list, &p, 0);
- if (start_cpu >= INT_MAX
+ if (start_cpu >= INT16_MAX
|| (*p != '\0' && *p != ',' && *p != '-' && *p != '\n'))
goto invalid;
@@ -194,7 +194,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
p = NULL;
end_cpu = strtoul(cpu_list, &p, 0);
- if (end_cpu >= INT_MAX || (*p != '\0' && *p != ',' && *p != '\n'))
+ if (end_cpu >= INT16_MAX || (*p != '\0' && *p != ',' && *p != '\n'))
goto invalid;
if (end_cpu < start_cpu)
@@ -209,7 +209,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
for (; start_cpu <= end_cpu; start_cpu++) {
/* check for duplicates */
for (i = 0; i < nr_cpus; i++)
- if (tmp_cpus[i].cpu == (int)start_cpu)
+ if (tmp_cpus[i].cpu == (int16_t)start_cpu)
goto invalid;
if (nr_cpus == max_entries) {
@@ -219,7 +219,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
goto invalid;
tmp_cpus = tmp;
}
- tmp_cpus[nr_cpus++].cpu = (int)start_cpu;
+ tmp_cpus[nr_cpus++].cpu = (int16_t)start_cpu;
}
if (*p)
++p;
diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h
index 188a667babc6..8c1ab0f9194e 100644
--- a/tools/lib/perf/include/perf/cpumap.h
+++ b/tools/lib/perf/include/perf/cpumap.h
@@ -4,10 +4,11 @@
#include <perf/core.h>
#include <stdbool.h>
+#include <stdint.h>
/** A wrapper around a CPU to avoid confusion with the perf_cpu_map's map's indices. */
struct perf_cpu {
- int cpu;
+ int16_t cpu;
};
struct perf_cache {